diff --git a/.bazelrc b/.bazelrc index 391fc927c27..f137258fa26 100644 --- a/.bazelrc +++ b/.bazelrc @@ -194,6 +194,9 @@ build:macos --apple_platform_type=macos # gRPC on MacOS requires this #define build:macos --copt=-DGRPC_BAZEL_BUILD +# Avoid hitting command line argument limit +build:macos --features=archive_param_file + # Settings for MacOS on ARM CPUs. build:macos_arm64 --cpu=darwin_arm64 build:macos_arm64 --macos_minimum_os=11.0 @@ -345,6 +348,7 @@ build:windows --host_copt=/D_USE_MATH_DEFINES # Windows has a relatively short command line limit, which TF has begun to hit. # See https://docs.bazel.build/versions/main/windows.html build:windows --features=compiler_param_file +build:windows --features=archive_param_file # Speed Windows compile times. Available in VS 16.4 (we are on 16.11). See # https://groups.google.com/a/tensorflow.org/d/topic/build/SsW98Eo7l3o/discussion @@ -446,7 +450,6 @@ build:rbe --bes_backend=buildeventservice.googleapis.com build:rbe --bes_results_url="https://source.cloud.google.com/results/invocations" build:rbe --bes_timeout=600s build:rbe --define=EXECUTOR=remote -build:rbe --flaky_test_attempts=3 build:rbe --jobs=800 build:rbe --remote_executor=grpcs://remotebuildexecution.googleapis.com build:rbe --remote_timeout=3600 @@ -627,7 +630,6 @@ try-import %workspace%/.bazelrc.user # Here are bazelrc configs for release builds build:release_base --config=v2 -test:release_base --flaky_test_attempts=3 test:release_base --test_size_filters=small,medium build:release_cpu_linux --config=release_base @@ -691,10 +693,10 @@ build:ubsan --linkopt -fsanitize=undefined build:ubsan --linkopt -lubsan # Disable TFRT integration for now unless --config=tfrt is specified. 
-build --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug +build 
--deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/ir,tensorflow/compiler/mlir/tfrt/ir/mlrt,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/mlrt,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/compiler/mlir/tfrt/transforms/mlrt,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/mlrt,tensorflow/core/tfrt/mlrt/attribute,tensorflow/core/tfrt/mlrt/kernel,tensorflow/core/tfrt/mlrt/bytecode,tensorflow/core/tfrt/mlrt/interpreter,tensorflow/compiler/mlir/tfrt/translate/mlrt,tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug,tensorflow/core/tfrt/saved_model/python,tensorflow/core/tfrt/graph_executor/python # TODO(b/240450920): We are in the process of migrating JitRt backend to XLA # and while we are doing this we can't keep it buildable/testable in OSS. 
-build:tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/eager,tensorflow/core/tfrt/eager/backends/cpu,tensorflow/core/tfrt/eager/backends/gpu,tensorflow/core/tfrt/eager/core_runtime,tensorflow/core/tfrt/eager/cpp_tests/core_runtime,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug +build:tfrt 
--deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compiler/mlir/tfrt/benchmarks,tensorflow/compiler/mlir/tfrt/ir,tensorflow/compiler/mlir/tfrt/ir/mlrt,tensorflow/compiler/mlir/tfrt/jit/python_binding,tensorflow/compiler/mlir/tfrt/jit/transforms,tensorflow/compiler/mlir/tfrt/python_tests,tensorflow/compiler/mlir/tfrt/tests,tensorflow/compiler/mlir/tfrt/tests/mlrt,tensorflow/compiler/mlir/tfrt/tests/ir,tensorflow/compiler/mlir/tfrt/tests/analysis,tensorflow/compiler/mlir/tfrt/tests/jit,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_tfrt,tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt,tensorflow/compiler/mlir/tfrt/tests/tf_to_corert,tensorflow/compiler/mlir/tfrt/tests/tf_to_tfrt_data,tensorflow/compiler/mlir/tfrt/tests/saved_model,tensorflow/compiler/mlir/tfrt/transforms/lhlo_gpu_to_tfrt_gpu,tensorflow/compiler/mlir/tfrt/transforms/mlrt,tensorflow/core/runtime_fallback,tensorflow/core/runtime_fallback/conversion,tensorflow/core/runtime_fallback/kernel,tensorflow/core/runtime_fallback/opdefs,tensorflow/core/runtime_fallback/runtime,tensorflow/core/runtime_fallback/util,tensorflow/core/tfrt/mlrt,tensorflow/core/tfrt/mlrt/attribute,tensorflow/core/tfrt/mlrt/kernel,tensorflow/core/tfrt/mlrt/bytecode,tensorflow/core/tfrt/mlrt/interpreter,tensorflow/compiler/mlir/tfrt/translate/mlrt,tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata,tensorflow/core/tfrt/gpu,tensorflow/core/tfrt/run_handler_thread_pool,tensorflow/core/tfrt/runtime,tensorflow/core/tfrt/saved_model,tensorflow/core/tfrt/graph_executor,tensorflow/core/tfrt/saved_model/tests,tensorflow/core/tfrt/tpu,tensorflow/core/tfrt/utils,tensorflow/core/tfrt/utils/debug,tensorflow/core/tfrt/saved_model/python,tensorflow/core/tfrt/graph_executor/python # TF Fuzztest config try-import fuzztest.bazelrc diff --git a/.bazelversion b/.bazelversion index f53152b50eb..b536fbc5061 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1,2 +1,2 @@ -5.3.0 +6.1.0 # NOTE: Update Bazel version in 
tensorflow/tools/ci_build/release/common.sh.oss \ No newline at end of file diff --git a/.github/bot_config.yml b/.github/bot_config.yml index b5cf2a5a6c2..b90b4f52c56 100644 --- a/.github/bot_config.yml +++ b/.github/bot_config.yml @@ -15,7 +15,7 @@ # A list of assignees assignees: - - synandi + - sushreebarsa - SuryanarayanaY - tilakrayal # A list of assignees for compiler folder diff --git a/.github/workflows/arm-cd.yml b/.github/workflows/arm-cd.yml index b601b0054c7..a191c65a98f 100644 --- a/.github/workflows/arm-cd.yml +++ b/.github/workflows/arm-cd.yml @@ -28,6 +28,7 @@ jobs: runs-on: [self-hosted, linux, ARM64] continue-on-error: ${{ matrix.experimental }} strategy: + fail-fast: false matrix: pyver: ['3.8', '3.9', '3.10'] experimental: [false] diff --git a/.github/workflows/arm-ci-extended.yml b/.github/workflows/arm-ci-extended.yml index 7e32dafabe9..faba79089b8 100644 --- a/.github/workflows/arm-ci-extended.yml +++ b/.github/workflows/arm-ci-extended.yml @@ -27,8 +27,9 @@ jobs: if: github.repository == 'tensorflow/tensorflow' # Don't do this in forks runs-on: [self-hosted, linux, ARM64] strategy: + fail-fast: false matrix: - pyver: ['3.10'] + pyver: ['3.8', '3.9', '3.10', '3.11'] steps: - name: Stop old running containers (if any) shell: bash diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml index d32d7affd64..965e3515b84 100644 --- a/.github/workflows/update-rbe.yml +++ b/.github/workflows/update-rbe.yml @@ -92,6 +92,18 @@ jobs: map sigbuild-r2.13-clang-python3.9 2.13-python3.9 map sigbuild-r2.13-clang-python3.10 2.13-python3.10 map sigbuild-r2.13-clang-python3.11 2.13-python3.11 + # TF 2.14 + map sigbuild-r2.14 2.14-python3.9 + map sigbuild-r2.14-python3.8 2.14-python3.8 + map sigbuild-r2.14-python3.9 2.14-python3.9 + map sigbuild-r2.14-python3.10 2.14-python3.10 + map sigbuild-r2.14-python3.11 2.14-python3.11 + # TF 2.14 + Clang (containers are the same, but env vars in configs.bzl are different) + map 
sigbuild-r2.14-clang 2.14-python3.9 + map sigbuild-r2.14-clang-python3.8 2.14-python3.8 + map sigbuild-r2.14-clang-python3.9 2.14-python3.9 + map sigbuild-r2.14-clang-python3.10 2.14-python3.10 + map sigbuild-r2.14-clang-python3.11 2.14-python3.11 - name: Create Pull Request with changes uses: peter-evans/create-pull-request@2b011faafdcbc9ceb11414d64d0573f37c774b04 # v4.2.3 with: diff --git a/README.md b/README.md index fa7a6c45733..d0feb038bc0 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ -[![Python](https://img.shields.io/pypi/pyversions/tensorflow.svg?style=plastic)](https://badge.fury.io/py/tensorflow) +[![Python](https://img.shields.io/pypi/pyversions/tensorflow.svg)](https://badge.fury.io/py/tensorflow) [![PyPI](https://badge.fury.io/py/tensorflow.svg)](https://badge.fury.io/py/tensorflow) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.4724125.svg)](https://doi.org/10.5281/zenodo.4724125) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/1486/badge)](https://bestpractices.coreinfrastructure.org/projects/1486) @@ -11,6 +11,8 @@ [![Fuzzing Status](https://oss-fuzz-build-logs.storage.googleapis.com/badges/tensorflow-py.svg)](https://bugs.chromium.org/p/oss-fuzz/issues/list?sort=-opened&can=1&q=proj:tensorflow-py) [![OSSRank](https://shields.io/endpoint?url=https://ossrank.com/shield/44)](https://ossrank.com/p/44) [![Contributor Covenant](https://img.shields.io/badge/Contributor%20Covenant-v1.4%20adopted-ff69b4.svg)](CODE_OF_CONDUCT.md) +[![TF Official Continuous](https://tensorflow.github.io/build/TF%20Official%20Continuous.svg)](https://tensorflow.github.io/build#TF%20Official%20Continuous) +[![TF Official Nightly](https://tensorflow.github.io/build/TF%20Official%20Nightly.svg)](https://tensorflow.github.io/build#TF%20Official%20Nightly) **`Documentation`** | ------------------- | diff --git a/RELEASE.md b/RELEASE.md index 87ebf46e557..c404a6183ae 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -16,6 +16,11 @@ 2.13 may be 
used when it is necessary to determine if a value is specifically a symbolic tensor. +* `tf.compat.v1.Session` + * `tf.compat.v1.Session.partial_run` and + `tf.compat.v1.Session.partial_run_setup` will be deprecated in the + next release. + # Known Caveats * @@ -26,6 +31,15 @@ * * +* `tf.keras` + * `Model.compile` now support `steps_per_execution='auto'` as a parameter, + allowing automatic tuning of steps per execution during `Model.fit`, + `Model.predict`, and `Model.evaluate` for a significant performance boost. + +* Enable JIT-compiled i64-indexed kernels on GPU for large tensors with more + than 2**32 elements. + * Unary GPU kernels: Abs, Atanh, Acos, Acosh, Asin, Asinh, Atan, Cos, + Cosh, Sin, Sinh, Tan, Tanh. # Bug Fixes and Other Changes * `tf.lite` @@ -34,6 +48,22 @@ * * +* `tf.config.experimental.enable_tensor_float_32_execution` + * Disabling TensorFloat-32 execution now causes TPUs to use float32 + precision for float32 matmuls and other ops. TPUs have always used + bfloat16 precision for certain ops, like matmul, when such ops had float32 + inputs. Now, disabling TensorFloat-32 by calling + `tf.config.experimental.enable_tensor_float_32_execution(False)` will + cause TPUs to use float32 precision for such ops instead of bfloat16. + +* `tf.experimental.dtensor` + * API changes for Relayout. Added a new API, `dtensor.relayout_like`, for + relayouting a tensor according to the layout of another tensor. + * Added `dtensor.get_default_mesh`, for retrieving the current default + mesh under the dtensor context. + +* TensorFlow Debugger (tfdbg) CLI: ncurses-based CLI for tfdbg v1 was removed. + # Thanks to our Contributors This release contains contributions from many people at Google, as well as: @@ -185,6 +215,9 @@ This release contains contributions from many people at Google, as well as: `dataset = dataset.shuffle(dataset.cardinality())`. 
This will load the full dataset into memory so that it can be shuffled, so make sure to only use this with datasets of filenames or other small datasets. + * Added a new `tf.data.experimental.pad_to_cardinality` transformation + which pads a dataset with zero elements up to a specified cardinality. + This is useful for avoiding partial batches while not dropping any data. * `tf.math` @@ -243,6 +276,8 @@ This release contains contributions from many people at Google, as well as: * `tf.lite`: * Add UINT32 support to tfl.pack + * Add INT64 support to tfl.range + * Add UINT32 support to tfl.concatenation ## Thanks to our Contributors diff --git a/ci/README.md b/ci/README.md new file mode 100644 index 00000000000..1dc705f8e35 --- /dev/null +++ b/ci/README.md @@ -0,0 +1,17 @@ +# TensorFlow continuous integration + +> **Warning** This folder is still under construction. It is part of an ongoing +> effort to improve the structure of CI and build related files within the +> TensorFlow repo. This warning will be removed when the contents of this +> directory are stable and appropriate documentation around its usage is in +> place. + +Maintainer: TensorFlow DevInfra + +******************************************************************************** + +The CI folder contains the configuration files and scripts used to build, test, +and deploy TensorFlow. This folder is typically used by continuous integration +(CI) tools to build and test TensorFlow whenever there is a change to the +code. This folder is broken into subfolders that represent the level of support +and ownership of the files contained within. diff --git a/ci/devinfra/README.md b/ci/devinfra/README.md new file mode 100644 index 00000000000..c31d50b87a6 --- /dev/null +++ b/ci/devinfra/README.md @@ -0,0 +1,17 @@ +# DevInfra CI Directory + +> **Warning** This folder is still under construction. It is part of an ongoing +> effort to improve the structure of CI and build related files within the +> TensorFlow repo. 
This warning will be removed when the contents of this +> directory are stable and appropriate documentation around its usage is in +> place. + +Maintainer: TensorFlow DevInfra + +Issue Reporting: File an issue against this repo and tag +[@devinfra](https://github.com/orgs/tensorflow/teams/devinfra) + +******************************************************************************** + +A directory for build and CI related scripts and jobs managed by the TensorFlow +DevInfra team but not part of the official build, test, or release process. diff --git a/ci/official/README.md b/ci/official/README.md new file mode 100644 index 00000000000..2bd578c0160 --- /dev/null +++ b/ci/official/README.md @@ -0,0 +1,17 @@ +# Official CI Directory + +> **Warning** This folder is still under construction. It is part of an ongoing +> effort to improve the structure of CI and build related files within the +> TensorFlow repo. This warning will be removed when the contents of this +> directory are stable and appropriate documentation around its usage is in +> place. + +Maintainer: TensorFlow and TensorFlow DevInfra + +Issue Reporting: File an issue against this repo and tag +[@devinfra](https://github.com/orgs/tensorflow/teams/devinfra) + +******************************************************************************** + +A directory for build and CI related scripts and jobs that are used and +monitored as part of the official TensorFlow build, test, and release process. 
diff --git a/configure.py b/configure.py index 73e124fb356..47b566a9c0f 100644 --- a/configure.py +++ b/configure.py @@ -964,7 +964,6 @@ def set_other_cuda_vars(environ_cp): def system_specific_test_config(environ_cp): """Add default build and test flags required for TF tests to bazelrc.""" - write_to_bazelrc('test --flaky_test_attempts=3') write_to_bazelrc('test --test_size_filters=small,medium') # Each instance of --test_tag_filters or --build_tag_filters overrides all diff --git a/tensorflow/BUILD b/tensorflow/BUILD index fce465ff1f2..a014c90df67 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -109,6 +109,7 @@ PACKAGE_STATIC_DEPS = [ "@local_execution_config_platform//:__subpackages__", "@mkl_dnn_acl_compatible//:__subpackages__", "@mkl_dnn_v1//:__subpackages__", + "@ml_dtypes//:__subpackages__", "@nccl_archive//:__subpackages__", "@nvtx_archive//:__subpackages__", "@org_sqlite//:__subpackages__", @@ -1036,7 +1037,13 @@ package_group( ], ) -package_group(name = "ndarray_tensor_allow_list") +package_group( + name = "ndarray_tensor_allow_list", + packages = [ + "//third_party/py/courier/...", + "//third_party/py/tensorfn/...", + ], +) # Packages that use private types symbols, until they are exported. # TODO(b/154650521) Remove. diff --git a/tensorflow/c/BUILD b/tensorflow/c/BUILD index 0e70244453f..f52e342da94 100644 --- a/tensorflow/c/BUILD +++ b/tensorflow/c/BUILD @@ -129,6 +129,7 @@ cc_library( # TODO: Only include tf_tstring_hdrs. Don't expose the implementation of TF_TString to API # users. 
":tf_tstring", + "//tensorflow/core:protos_all_cc", ], ) @@ -171,6 +172,7 @@ tf_cuda_library( ":tf_buffer_internal", ":tf_status_internal", ":tf_tensor_internal", + "//tensorflow/core:protos_all_cc", ], ) @@ -238,6 +240,7 @@ tf_cuda_library( ":tf_status_internal", ":tf_tensor_internal", ":tf_tstring", + "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:tstring", "//tensorflow/tsl/c:tsl_status", ] + select({ @@ -881,7 +884,7 @@ tf_cc_test( tf_cuda_cc_test( name = "c_api_test", - size = "small", + size = "medium", srcs = ["c_api_test.cc"], data = [ ":test_op1.so", @@ -968,7 +971,7 @@ tf_cc_test( tf_cc_test( name = "c_api_function_test", - size = "small", + size = "medium", srcs = ["c_api_function_test.cc"], deps = [ ":c_api", @@ -985,7 +988,7 @@ tf_cc_test( tf_cc_test( name = "while_loop_test", - size = "small", + size = "medium", srcs = ["while_loop_test.cc"], deps = [ ":c_api", @@ -1013,7 +1016,7 @@ tf_kernel_library( tf_cuda_cc_test( name = "env_test", - size = "small", + size = "medium", srcs = ["env_test.cc"], linkopts = select({ "//tensorflow:macos": ["-headerpad_max_install_names"], @@ -1032,7 +1035,7 @@ tf_cuda_cc_test( tf_cuda_cc_test( name = "kernels_test", - size = "small", + size = "medium", srcs = ["kernels_test.cc"], linkopts = select({ "//tensorflow:macos": ["-headerpad_max_install_names"], @@ -1059,7 +1062,7 @@ tf_cuda_cc_test( tf_cc_test( name = "ops_test", - size = "small", + size = "medium", srcs = ["ops_test.cc"], linkopts = select({ "//conditions:default": [], diff --git a/tensorflow/c/c_api_internal.h b/tensorflow/c/c_api_internal.h index 92f63553ee1..15d279b61ac 100644 --- a/tensorflow/c/c_api_internal.h +++ b/tensorflow/c/c_api_internal.h @@ -142,7 +142,7 @@ struct TF_ImportGraphDefOptions { // Backing memory for TensorId fields in opts. // TODO(skyewm): it'd be better if ImportGraphDefOptions owned this. 
- std::list tensor_id_data; + std::vector tensor_id_data; }; struct TF_ImportGraphDefResults { @@ -152,7 +152,7 @@ struct TF_ImportGraphDefResults { std::vector missing_unused_key_indexes; // Backing memory for missing_unused_key_names values. - std::list missing_unused_key_names_data; + std::vector missing_unused_key_names_data; }; struct TF_DeviceList { diff --git a/tensorflow/c/c_api_macros.h b/tensorflow/c/c_api_macros.h index e0c91a0d549..d73546aed16 100644 --- a/tensorflow/c/c_api_macros.h +++ b/tensorflow/c/c_api_macros.h @@ -26,7 +26,12 @@ limitations under the License. #define TF_CAPI_EXPORT __declspec(dllimport) #endif // TF_COMPILE_LIBRARY #else +#ifdef TF_CAPI_WEAK +#define TF_CAPI_EXPORT \ + __attribute__((visibility("default"))) __attribute((weak)) +#else #define TF_CAPI_EXPORT __attribute__((visibility("default"))) +#endif // TF_CAPI_WEAK #endif // _WIN32 #endif // SWIG diff --git a/tensorflow/c/eager/BUILD b/tensorflow/c/eager/BUILD index dd61bd26bc1..748d49565f6 100644 --- a/tensorflow/c/eager/BUILD +++ b/tensorflow/c/eager/BUILD @@ -8,7 +8,7 @@ load( "tf_cuda_cc_test", "tf_cuda_library", ) -load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup", "internal_tfrt_deps") +load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup") load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", @@ -95,7 +95,7 @@ tf_cuda_library( "//tensorflow/core/distributed_runtime/rpc/eager:grpc_eager_client", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", - ] + internal_tfrt_deps(), + ], alwayslink = 1, ) @@ -636,7 +636,7 @@ tf_cuda_library( tf_cuda_cc_test( name = "c_api_test", - size = "small", + size = "medium", srcs = [ "c_api_debug_test.cc", "c_api_test.cc", @@ -653,7 +653,6 @@ tf_cuda_cc_test( ":c_api_test_util", ":tfe_op_internal", ":tfe_tensorhandle_internal", - "@com_google_absl//absl/strings", "//tensorflow/c:c_test_util", "//tensorflow/core:lib", 
"//tensorflow/core:lib_internal", @@ -663,10 +662,7 @@ tf_cuda_cc_test( "//tensorflow/core/common_runtime/eager:eager_operation", "//tensorflow/core/common_runtime/eager:tensor_handle", "//tensorflow/core/distributed_runtime/rpc:grpc_server_lib", - # copybara:uncomment_begin - # "//tensorflow/core/tfrt/eager:c_api_tfrt", - # "@tf_runtime//backends/cpu:tf_ops_alwayslink", - # copybara:uncomment_end + "@com_google_absl//absl/strings", ], ) @@ -693,7 +689,7 @@ tf_cuda_library( tf_cuda_cc_test( name = "c_api_remote_test", - size = "small", + size = "medium", srcs = [ "c_api_remote_test.cc", ], @@ -725,7 +721,7 @@ tf_cuda_cc_test( tf_cuda_cc_test( name = "c_api_remote_function_test", - size = "small", + size = "medium", srcs = [ "c_api_remote_function_test.cc", ], @@ -776,7 +772,7 @@ tf_cuda_cc_test( tf_cuda_cc_test( name = "c_api_cluster_test", - size = "small", + size = "medium", srcs = [ "c_api_cluster_test.cc", ], @@ -1014,7 +1010,7 @@ cc_library( tf_cc_test( name = "custom_device_test", - size = "small", + size = "medium", srcs = [ "custom_device_test.cc", ], diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 8503485f63c..41ced14455e 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -64,13 +64,6 @@ limitations under the License. #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/public/version.h" -// "tensorflow/core/platform/platform.h" must be included first before using -// PLATFORM_GOOGLE, IS_MOBILE_PLATFORM, etc. 
-#if defined(PLATFORM_GOOGLE) && !defined(LIBTPU_ON_GCE) && \ - !defined(PLATFORM_FUCHSIA) -#include "tensorflow/core/tfrt/eager/c_api_tfrt.h" -#endif // PLATFORM_GOOGLE && !LIBTPU_ON_GCE && !PLATFORM_FUCHSIA - #if !defined(IS_MOBILE_PLATFORM) #include "tensorflow/core/common_runtime/eager/context_distributed_manager.h" #endif // !IS_MOBILE_PLATFORM @@ -117,18 +110,8 @@ void TFE_DeleteContextOptions(TFE_ContextOptions* options) { delete options; } TFE_Context* TFE_NewContext(const TFE_ContextOptions* opts, TF_Status* status) { if (opts->use_tfrt) { -#if defined(PLATFORM_GOOGLE) && !defined(LIBTPU_ON_GCE) && \ - !defined(PLATFORM_FUCHSIA) - tfrt::tf::ContextInterface* tfrt_context = new tfrt::tf::ContextInterface( - opts->session_options.options, - static_cast( - opts->device_placement_policy), - opts->async); - return tensorflow::wrap(tfrt_context); -#else status->status = tensorflow::errors::Unimplemented("TFRT is not supported"); return nullptr; -#endif // PLATFORM_GOOGLE && !LIBTPU_ON_GCE && !PLATFORM_FUCHSIA } std::vector> devices; status->status = tensorflow::DeviceFactory::AddDevices( diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc index e35bc962525..13b688889a4 100644 --- a/tensorflow/c/eager/c_api_distributed_test.cc +++ b/tensorflow/c/eager/c_api_distributed_test.cc @@ -434,7 +434,7 @@ class FunctionErrorInjectionPass : public tensorflow::FunctionOptimizationPass { tensorflow::Status Run(const std::string& function_name, const tensorflow::DeviceSet& device_set, const tensorflow::ConfigProto& config_proto, - absl::string_view xla_compile_device_type, + const FunctionOptions& function_options, std::unique_ptr* graph, tensorflow::FunctionLibraryDefinition* flib_def, std::vector* control_ret_node_names, diff --git a/tensorflow/c/eager/c_api_test.cc b/tensorflow/c/eager/c_api_test.cc index 0f8c97ce7ba..254648d9e09 100644 --- a/tensorflow/c/eager/c_api_test.cc +++ b/tensorflow/c/eager/c_api_test.cc @@ 
-47,10 +47,6 @@ limitations under the License. #include "tensorflow/core/protobuf/rewriter_config.pb.h" #include "tensorflow/core/protobuf/tensorflow_server.pb.h" -#ifdef PLATFORM_GOOGLE -#include "tensorflow/core/tfrt/eager/c_api_tfrt.h" -#endif - using tensorflow::string; namespace { @@ -1262,16 +1258,6 @@ TEST(CAPI, RunAddFunctionWithGrappler) { RunAddFunction(/*use_tfrt=*/false, /*enable_grappler=*/true); } -#ifdef PLATFORM_GOOGLE -TEST(CAPI, RunAddFunction_TFRT) { - RunAddFunction(/*use_tfrt=*/true, /*enable_grappler=*/false); -} - -TEST(CAPI, RunAddFunctionWithGrappler_TFRT) { - RunAddFunction(/*use_tfrt=*/true, /*enable_grappler=*/true); -} -#endif - void BM_ExecuteFunction(::testing::benchmark::State& state) { const int async = state.range(0); state.SetLabel(async ? "ExecuteFunctionAsync" : "ExecuteFunction"); @@ -1802,23 +1788,9 @@ void TestOpAddAttrs(bool use_tfrt) { CHECK_EQ(TF_OK, TF_GetCode(status)) << TF_Message(status); tensorflow::AttrValueMap attr_values; - if (use_tfrt) { -#ifdef PLATFORM_GOOGLE - auto* op = tensorflow::down_cast( - tensorflow::unwrap(copy_op)); - auto* tfrt_op_attrs = - tensorflow::down_cast( - op->GetOpAttrs()); - tensorflow::DataType result; - tfrt_op_attrs->GetType("dtype", &result); - EXPECT_EQ(tensorflow::DT_FLOAT, result); - tfrt_op_attrs->GetFallbackAttrs()->FillAttrValueMap(&attr_values); -#endif - } else { - tensorflow::EagerOperation* op = - tensorflow::OperationFromInterface(tensorflow::unwrap(copy_op)); - op->Attrs().FillAttrValueMap(&attr_values); - } + tensorflow::EagerOperation* op = + tensorflow::OperationFromInterface(tensorflow::unwrap(copy_op)); + op->Attrs().FillAttrValueMap(&attr_values); EXPECT_EQ(tensorflow::DT_FLOAT, attr_values.find("dtype")->second.type()); TF_DeleteStatus(status); @@ -1829,11 +1801,6 @@ void TestOpAddAttrs(bool use_tfrt) { TEST(CAPI, TestTFE_OpAddAttrs) { TestOpAddAttrs(/*use_tfrt=*/false); } -#ifdef PLATFORM_GOOGLE -TEST(CAPI, TestTFE_OpAddAttrs_TFRT) { 
TestOpAddAttrs(/*use_tfrt=*/true); } - -#endif - TEST(CAPI, TestTFE_OpAttrsSerialize) { TF_Status* status = TF_NewStatus(); TFE_ContextOptions* opts = TFE_NewContextOptions(); diff --git a/tensorflow/c/eager/c_api_unified_experimental_test.cc b/tensorflow/c/eager/c_api_unified_experimental_test.cc index edaf3d8e579..e866ec0ca78 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_test.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_test.cc @@ -1006,18 +1006,10 @@ TEST_P(UnifiedCAPI, TF_ExecutionContextGetTFEContextFromFunctionContextRaises) { // The above tests are run for a combination of: // - graphdef and MLIR tracing engine -// - Using TFRT as an execution runtime (true == enable TFRT) -#ifdef PLATFORM_GOOGLE -INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, - ::testing::Combine(::testing::Values("graphdef", - "mlir"), - ::testing::Values(true, false))); -#else INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, ::testing::Combine(::testing::Values("graphdef", "mlir"), ::testing::Values(false))); -#endif } // namespace } // namespace tensorflow diff --git a/tensorflow/c/eager/unified_api_test.cc b/tensorflow/c/eager/unified_api_test.cc index 27e42be5bcc..fce632b2210 100644 --- a/tensorflow/c/eager/unified_api_test.cc +++ b/tensorflow/c/eager/unified_api_test.cc @@ -188,18 +188,10 @@ TEST_P(UnifiedAPI, TestPartialShapeTracing) { ASSERT_EQ(-1, shape.dim_size(1)); } -#ifdef PLATFORM_GOOGLE -INSTANTIATE_TEST_SUITE_P( - UnifiedCppAPI, UnifiedAPI, - ::testing::Combine(::testing::Values("graphdef", "mlir"), - /*tfrt*/ ::testing::Values(true, false), - /*use_function*/ ::testing::Values(true, false))); -#else INSTANTIATE_TEST_SUITE_P( UnifiedCppAPI, UnifiedAPI, ::testing::Combine(::testing::Values("graphdef", "mlir"), /*tfrt*/ ::testing::Values(false), /*use_function*/ ::testing::Values(true, false))); -#endif } // namespace } // namespace tensorflow diff --git a/tensorflow/c/experimental/gradients/custom_gradient_test.cc 
b/tensorflow/c/experimental/gradients/custom_gradient_test.cc index cce9a051a74..02066362892 100644 --- a/tensorflow/c/experimental/gradients/custom_gradient_test.cc +++ b/tensorflow/c/experimental/gradients/custom_gradient_test.cc @@ -125,19 +125,12 @@ TEST_P(CustomGradientTest, ExpWithPassThroughGrad) { result_tensor = nullptr; } -#ifdef PLATFORM_GOOGLE -INSTANTIATE_TEST_SUITE_P( - CustomGradientTest, CustomGradientTest, - ::testing::Combine(::testing::Values("graphdef", "mlir"), - /*tfrt*/ ::testing::Values(true, false), - /*executing_eagerly*/ ::testing::Values(true, false))); -#else INSTANTIATE_TEST_SUITE_P( CustomGradientTest, CustomGradientTest, ::testing::Combine(::testing::Values("graphdef", "mlir"), /*tfrt*/ ::testing::Values(false), /*executing_eagerly*/ ::testing::Values(true, false))); -#endif + } // namespace } // namespace internal } // namespace gradients diff --git a/tensorflow/c/experimental/next_pluggable_device/BUILD b/tensorflow/c/experimental/next_pluggable_device/BUILD index eda00deb59c..ef81acf75a5 100644 --- a/tensorflow/c/experimental/next_pluggable_device/BUILD +++ b/tensorflow/c/experimental/next_pluggable_device/BUILD @@ -24,6 +24,7 @@ cc_library( "//tensorflow/compiler/xla/pjrt:pjrt_c_api_client", "//tensorflow/compiler/xla/pjrt:pjrt_client", "//tensorflow/compiler/xla/pjrt/c:pjrt_c_api_hdrs", + "//tensorflow/compiler/xla/stream_executor/tpu:tpu_initializer_helper", "//tensorflow/core:framework", "//tensorflow/core/common_runtime/next_pluggable_device:plugin_resource", "//tensorflow/core/platform:status", @@ -32,6 +33,8 @@ cc_library( "//tensorflow/tsl/distributed_runtime/coordination:coordination_service_agent", "//tensorflow/tsl/platform:errors", "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/c/experimental/next_pluggable_device/c_api.cc b/tensorflow/c/experimental/next_pluggable_device/c_api.cc index caa49be2d3f..dda6f5bcc26 100644 --- 
a/tensorflow/c/experimental/next_pluggable_device/c_api.cc +++ b/tensorflow/c/experimental/next_pluggable_device/c_api.cc @@ -21,6 +21,9 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/strings/ascii.h" +#include "absl/strings/str_cat.h" #include "tensorflow/c/kernels_experimental.h" #include "tensorflow/c/tf_status_helper.h" #include "tensorflow/c/tf_status_internal.h" @@ -30,6 +33,7 @@ limitations under the License. #include "tensorflow/compiler/jit/variable_info_util.h" #include "tensorflow/compiler/xla/pjrt/pjrt_c_api_client.h" #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/stream_executor/tpu/tpu_initializer_helper.h" // NOLINT(unused-includes): required for tensorflow::tpu::FindAndLoadTpuLibrary #include "tensorflow/core/common_runtime/next_pluggable_device/plugin_resource.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/platform/status.h" @@ -110,9 +114,9 @@ TF_VariableInfo* TF_CreateVariableInfoFromContext(TF_OpKernelContext* ctx, const tensorflow::Tensor& arg_tensor = cc_ctx->input(index); tsl::Status cc_status; if (arg_tensor.dtype() != tensorflow::DT_RESOURCE) { - cc_status = tsl::errors::InvalidArgument( - "Trying to obtain resource handle from Input[", index, - "], which is not type DT_RESOURCE."); + cc_status = absl::InvalidArgumentError( + absl::StrCat("Trying to obtain resource handle from Input[", index, + "], which is not type DT_RESOURCE.")); tsl::Set_TF_Status_from_Status(status, cc_status); return nullptr; } @@ -140,12 +144,12 @@ void TF_AllocateTempForVariableInfo(TF_OpKernelContext* ctx, auto* cc_ctx = reinterpret_cast(ctx); tsl::Status cc_status; if (var_info == nullptr) { - cc_status = tsl::errors::InvalidArgument("TF_VariableInfo is NULL."); + cc_status = absl::InvalidArgumentError("TF_VariableInfo is NULL."); tsl::Set_TF_Status_from_Status(status, cc_status); return; } if (var_info->var_info.var() == nullptr) { - 
cc_status = tsl::errors::InvalidArgument( + cc_status = absl::InvalidArgumentError( "VariableInfo does not track a resource variable."); tsl::Set_TF_Status_from_Status(status, cc_status); return; @@ -161,12 +165,12 @@ TF_Tensor* TF_GetTensorFromVariableInfo(TF_VariableInfo* var_info, TF_Status* status) { tsl::Status cc_status; if (var_info == nullptr) { - cc_status = tsl::errors::InvalidArgument("TF_VariableInfo is NULL."); + cc_status = absl::InvalidArgumentError("TF_VariableInfo is NULL."); tsl::Set_TF_Status_from_Status(status, cc_status); return nullptr; } if (var_info->var_info.var() == nullptr) { - cc_status = tsl::errors::InvalidArgument( + cc_status = absl::InvalidArgumentError( "VariableInfo does not track a resource variable."); tsl::Set_TF_Status_from_Status(status, cc_status); return nullptr; @@ -239,6 +243,18 @@ void TF_CoordinationServiceDeleteKeyValue(const char* key, void TF_CreateAndSetPjRtCApiClient(const char* device_type, TF_Status* status, PJRT_NamedValue* create_options, int num_options) { + // TODO(b/262050449): use a common plugin discovery mechanism, rather than + // having TPU-specific code here. +#if !defined(PLATFORM_GOOGLE) || defined(LIBTPU_STATIC) + if (absl::AsciiStrToLower(device_type) == "tpu") { + // TODO(b/261484192): handle device specific initialization. 
+ tsl::Status tpu_status = tensorflow::tpu::FindAndLoadTpuLibrary(); + if (!tpu_status.ok()) { + tensorflow::Set_TF_Status_from_Status(status, tpu_status); + return; + } + } +#endif tsl::StatusOr> pjrt_client = xla::GetCApiClient(device_type, pjrt::ConvertFromPjRtNamedValueList( create_options, num_options)); @@ -263,8 +279,9 @@ PJRT_Client* TF_GetPjRtCClient(const char* device_type, TF_Status* status) { tensorflow::down_cast(*pjrt_client); if (pjrt_c_api_client == nullptr) { tensorflow::Set_TF_Status_from_Status( - status, tsl::errors::Internal("PjRtClient for ", device_type, - " is not type PjRtCApiClient")); + status, + absl::InternalError(absl::StrCat("PjRtClient for ", device_type, + " is not type PjRtCApiClient"))); return nullptr; } TF_SetStatus(status, TF_OK, ""); @@ -282,8 +299,7 @@ PJRT_Buffer* TF_GetPjRtCBuffer(TF_Tensor* c_tensor, TF_Status* status) { tensorflow::AsyncValueTensor::FromTensor(&tensor); if (av_tensor == nullptr || av_tensor->GetBuffer() == nullptr) { tensorflow::Set_TF_Status_from_Status( - status, - tsl::errors::Internal("Input tensor does not have PjRtBuffer.")); + status, absl::InternalError("Input tensor does not have PjRtBuffer.")); return nullptr; } auto* c_api_buffer = @@ -291,7 +307,7 @@ PJRT_Buffer* TF_GetPjRtCBuffer(TF_Tensor* c_tensor, TF_Status* status) { if (c_api_buffer == nullptr) { tensorflow::Set_TF_Status_from_Status( status, - tsl::errors::Internal( + absl::InternalError( "The PjRtBuffer in the tensor is not type PjRtCApiBuffer.")); return nullptr; } @@ -317,8 +333,9 @@ void TF_CreatePjRtBuffer(TF_Tensor* c_tensor, PJRT_Buffer* c_buffer, tensorflow::down_cast(*pjrt_client); if (pjrt_c_api_client == nullptr) { tensorflow::Set_TF_Status_from_Status( - status, tsl::errors::Internal("PjRtClient for ", device_type, - " is not type PjRtCApiClient")); + status, + absl::InternalError(absl::StrCat("PjRtClient for ", device_type, + " is not type PjRtCApiClient"))); return; } tensorflow::AsyncValueTensor* av_tensor = @@ -326,7 
+343,7 @@ void TF_CreatePjRtBuffer(TF_Tensor* c_tensor, PJRT_Buffer* c_buffer, if (av_tensor == nullptr) { tensorflow::Set_TF_Status_from_Status( status, - tsl::errors::Internal( + absl::InternalError( "The tensor to set PjRtBuffer is not an AsyncValueTensor.")); return; } diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD index ab7de9bae06..6c4c32a31db 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/BUILD +++ b/tensorflow/c/experimental/saved_model/core/revived_types/BUILD @@ -28,6 +28,8 @@ cc_library( "//tensorflow/cc/saved_model:constants", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], ) @@ -99,6 +101,8 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/lib/llvm_rtti", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", ], ) diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/asset.cc b/tensorflow/c/experimental/saved_model/core/revived_types/asset.cc index 5cc14d615f5..2f32e6c76b0 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/asset.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/asset.cc @@ -17,6 +17,8 @@ limitations under the License. 
#include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/immediate_execution_context.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/c/tensor_interface.h" @@ -37,8 +39,8 @@ Status Asset::Create(ImmediateExecutionContext* ctx, io::JoinPath(saved_model_dir, kSavedModelAssetsDirectory, asset_filename); AbstractTensorPtr tensor(ctx->CreateStringScalar(abs_path)); if (tensor.get() == nullptr) { - return errors::Internal( - "Failed to create scalar string tensor for Asset at path ", abs_path); + return absl::InternalError(absl::StrCat( + "Failed to create scalar string tensor for Asset at path ", abs_path)); } ImmediateTensorHandlePtr handle(ctx->CreateLocalHandle(tensor.get())); diff --git a/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc b/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc index b9344238b79..fe78a84a649 100644 --- a/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc +++ b/tensorflow/c/experimental/saved_model/core/revived_types/partially_revived_objects.cc @@ -20,6 +20,8 @@ limitations under the License. 
#include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "absl/types/span.h" #include "tensorflow/c/eager/abstract_tensor_handle.h" #include "tensorflow/c/eager/immediate_execution_context.h" @@ -60,15 +62,15 @@ Status AssertAllCreateResourceFunctionsHaveNoCaptures( const TFConcreteFunctionRevivalState* create_resource_fn = resource.create_resource; if (create_resource_fn == nullptr) { - return errors::FailedPrecondition( - "Resource at node ", node_id, - " did not have a create_resource() function"); + return absl::FailedPreconditionError( + absl::StrCat("Resource at node ", node_id, + " did not have a create_resource() function")); } const SavedConcreteFunction* saved_create_resource_fn = create_resource_fn->saved_concrete_func; if (!saved_create_resource_fn->bound_inputs().empty()) { // TODO(b/124045874): Support loading resource functions via a top sort - return errors::Unimplemented( + return absl::UnimplementedError( "Create Resource functions with captures are currently unsupported."); } } @@ -86,9 +88,9 @@ Status TensorHandleFromNode(int node_id, const SavedObjectGraph& obj_graph, case SavedObject::kVariable: { const auto& variables_iter = objects.variables.find(node_id); if (variables_iter == objects.variables.end()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "Tried to convert node id ", node_id, - " of type variable to tensor but the variable wasn't initialized"); + " of type variable to tensor but the variable wasn't initialized")); } *handle = variables_iter->second->handle(); return Status(); @@ -96,9 +98,10 @@ Status TensorHandleFromNode(int node_id, const SavedObjectGraph& obj_graph, case SavedObject::kConstant: { const auto& constants_iter = objects.constants.find(node_id); if (constants_iter == objects.constants.end()) { - return errors::FailedPrecondition("Tried to convert node id ", node_id, - " of type constant to tensor but the " - "constant wasn't 
initialized"); + return absl::FailedPreconditionError( + absl::StrCat("Tried to convert node id ", node_id, + " of type constant to tensor but the " + "constant wasn't initialized")); } *handle = constants_iter->second->handle(); return Status(); @@ -106,9 +109,9 @@ Status TensorHandleFromNode(int node_id, const SavedObjectGraph& obj_graph, case SavedObject::kAsset: { const auto& assets_iter = objects.assets.find(node_id); if (assets_iter == objects.assets.end()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "Tried to convert node id ", node_id, - " of type asset to tensor but the asset wasn't initialized"); + " of type asset to tensor but the asset wasn't initialized")); } *handle = assets_iter->second->handle(); return Status(); @@ -116,24 +119,24 @@ Status TensorHandleFromNode(int node_id, const SavedObjectGraph& obj_graph, case SavedObject::kResource: { const auto& resource_iter = objects.restored_resources.find(node_id); if (resource_iter == objects.restored_resources.end()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "Tried to convert node id ", node_id, - " of type Resource to tensor but the Resource wasn't initialized"); + " of type Resource to tensor but the Resource wasn't initialized")); } const RestoredResourceRevivalState& resource = resource_iter->second; if (resource.resource_handle == nullptr) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "Resource with node id ", node_id, - " should have its resource_handle created, but was nullptr."); + " should have its resource_handle created, but was nullptr.")); } *handle = resource.resource_handle.get(); return Status(); } default: { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "Only objects of type variable, constant, asset, and resources have " "capturable tensorhandles. 
Encountered object of kind ", - node.kind_case(), " at node id: ", node_id); + node.kind_case(), " at node id: ", node_id)); } } } @@ -167,35 +170,35 @@ Status SignatureDefArgsFromInputs( // (args, kwargs), where args is an empty tuple, and kwargs is a dictionary of // string keys to TensorSpecs. if (!canonicalized_input_signature.has_tuple_value()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "SignatureDefFunction's canonicalized_input_signature should be " "of form tuple(tuple(), dict()), but was instead: \n", - canonicalized_input_signature.DebugString()); + canonicalized_input_signature.DebugString())); } const TupleValue& args_kwargs_tuple = canonicalized_input_signature.tuple_value(); if (args_kwargs_tuple.values_size() != 2) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "SignatureDefFunction's canonicalized_input_signature should be " "a tuple of two elements (args, kwargs), but was instead: \n", - args_kwargs_tuple.DebugString()); + args_kwargs_tuple.DebugString())); } const StructuredValue& args = args_kwargs_tuple.values(0); if (!args.has_tuple_value() || !args.tuple_value().values().empty()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "SignatureDefFunction's canonicalized_input_signature's args" "should be an empty tuple, but instead got: \n", - args.DebugString()); + args.DebugString())); } const StructuredValue& kwargs = args_kwargs_tuple.values(1); if (!kwargs.has_dict_value()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "SignatureDefFunction's canonicalized_input_signature's kwargs" "should be a dictionary, but instead got: \n", - kwargs.DebugString()); + kwargs.DebugString())); } const DictValue& kwargs_dict = kwargs.dict_value(); @@ -206,10 +209,10 @@ Status SignatureDefArgsFromInputs( const std::string& key = key_value.first; const StructuredValue& value = 
key_value.second; if (!value.has_tensor_spec_value()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "SignatureDefFunction's canonicalized_input_signature's kwargs" "dictionary contained a non-tensorspec value for key-value pair: \n", - "Key: ", key, "Value: \n", value.DebugString()); + "Key: ", key, "Value: \n", value.DebugString())); } result[key] = &value.tensor_spec_value(); } @@ -226,10 +229,10 @@ Status SignatureDefArgsFromInputs( Status SignatureDefReturnsFromOutputs(const StructuredValue& output_signature, std::vector* out) { if (!output_signature.has_dict_value()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "SignatureDefFunction's output_signature must be a dictionary, but " "instead got: ", - output_signature.DebugString()); + output_signature.DebugString())); } const DictValue& output_dict = output_signature.dict_value(); @@ -240,10 +243,10 @@ Status SignatureDefReturnsFromOutputs(const StructuredValue& output_signature, const std::string& key = key_value.first; const StructuredValue& value = key_value.second; if (!value.has_tensor_spec_value()) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "SignatureDefFunction's output_signature dictionary contained a " "non-tensorspec value for key-value pair: \n", - "Key: ", key, "Value: \n", value.DebugString()); + "Key: ", key, "Value: \n", value.DebugString())); } result[key] = &value.tensor_spec_value(); } @@ -337,7 +340,7 @@ Status InitializeCreateResourceFunctions(ImmediateExecutionContext* ctx, create_resource_fn->saved_concrete_func; if (!saved_create_resource_fn->bound_inputs().empty()) { // TODO(b/124045874): Load resource functions via a topological sort - return errors::Unimplemented( + return absl::UnimplementedError( "Create Resource functions with captures are currently unsupported."); } std::unique_ptr out; @@ -401,9 +404,9 @@ Status 
CreateAllResourceHandles(ImmediateExecutionContext* ctx, const TFConcreteFunction* create_resource_fn = revived->concrete_functions.Find(create_resource_fn_node); if (create_resource_fn == nullptr) { - return errors::FailedPrecondition( - "ConcreteFunction at node ", create_resource_fn_node, - " should have been initialized prior to being called."); + return absl::FailedPreconditionError( + absl::StrCat("ConcreteFunction at node ", create_resource_fn_node, + " should have been initialized prior to being called.")); } ImmediateOpPtr function_op; TF_RETURN_IF_ERROR(create_resource_fn->MakeCallOp({}, &function_op)); @@ -416,7 +419,7 @@ Status CreateAllResourceHandles(ImmediateExecutionContext* ctx, AbstractTensorHandlePtr owned_resource_handle(resource_handle); if (!tensorflow::isa( owned_resource_handle.get())) { - return errors::Internal("Unexpected tensor handle kind."); + return absl::InternalError("Unexpected tensor handle kind."); } ImmediateTensorHandlePtr result( reinterpret_cast( @@ -443,9 +446,9 @@ Status BuildResources(ImmediateExecutionContext* ctx, create_resource = revived->concrete_functions.Find( resource_revival_state.create_resource->node_id); if (create_resource == nullptr) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "'create_resource' function with node id ", - resource_revival_state.create_resource->node_id, " not found"); + resource_revival_state.create_resource->node_id, " not found")); } } @@ -454,9 +457,9 @@ Status BuildResources(ImmediateExecutionContext* ctx, initialize = revived->concrete_functions.Find( resource_revival_state.initialize->node_id); if (initialize == nullptr) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "'initialize' function with node id ", - resource_revival_state.initialize->node_id, " not found"); + resource_revival_state.initialize->node_id, " not found")); } } @@ -465,15 +468,16 @@ Status 
BuildResources(ImmediateExecutionContext* ctx, destroy_resource = revived->concrete_functions.Find( resource_revival_state.destroy_resource->node_id); if (destroy_resource == nullptr) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "'destroy_resource' function with node id ", - resource_revival_state.destroy_resource->node_id, " not found"); + resource_revival_state.destroy_resource->node_id, " not found")); } } if (resource_revival_state.resource_handle == nullptr) { - return errors::FailedPrecondition("Resource at node id ", node_id, - " does not have a resource handle."); + return absl::FailedPreconditionError( + absl::StrCat("Resource at node id ", node_id, + " does not have a resource handle.")); } revived->restored_resources.emplace( diff --git a/tensorflow/c/experimental/saved_model/internal/BUILD b/tensorflow/c/experimental/saved_model/internal/BUILD index d6dc1f202b0..36e5cb52d2e 100644 --- a/tensorflow/c/experimental/saved_model/internal/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/BUILD @@ -344,7 +344,7 @@ cc_library( tf_cc_test( name = "saved_model_api_test", - size = "small", + size = "medium", srcs = [ "saved_model_api_test.cc", ], diff --git a/tensorflow/c/experimental/saved_model/internal/testdata/BUILD b/tensorflow/c/experimental/saved_model/internal/testdata/BUILD index ee8fda30e46..a10cfd03e3d 100644 --- a/tensorflow/c/experimental/saved_model/internal/testdata/BUILD +++ b/tensorflow/c/experimental/saved_model/internal/testdata/BUILD @@ -13,15 +13,15 @@ py_strict_binary( srcs = ["gen_saved_models.py"], python_version = "PY3", deps = [ - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:tensor_spec", - "//tensorflow/python:variables", - "//tensorflow/python:while_loop", "//tensorflow/python/compat:v2_compat", "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:constant_op", + 
"//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:tensor_spec", "//tensorflow/python/module", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/ops:while_loop", "//tensorflow/python/saved_model", "@absl_py//absl:app", ], diff --git a/tensorflow/cc/BUILD b/tensorflow/cc/BUILD index c1019c705ba..82d1f2a7e4f 100644 --- a/tensorflow/cc/BUILD +++ b/tensorflow/cc/BUILD @@ -189,6 +189,8 @@ cc_library_with_android_deps( "//tensorflow/core:lib_internal", "//tensorflow/core:ops", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/cc/experimental/libtf/BUILD b/tensorflow/cc/experimental/libtf/BUILD index e281672de9e..31e15972668 100644 --- a/tensorflow/cc/experimental/libtf/BUILD +++ b/tensorflow/cc/experimental/libtf/BUILD @@ -1,4 +1,5 @@ -# TODO(aselle): describe this package. +#include "third_party/absl/strings/str_cat.h" +#TODO(aselle) : describe this package. 
load( "//tensorflow/core/platform:rules_cc.bzl", @@ -42,6 +43,8 @@ cc_library( "//tensorflow/core/platform:status", "//tensorflow/core/platform:statusor", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], ) @@ -84,13 +87,13 @@ py_strict_binary( srcs = ["tests/generate_testdata.py"], python_version = "PY3", deps = [ - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:tensor_spec", - "//tensorflow/python:variables", "//tensorflow/python/compat:v2_compat", "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:tensor_spec", "//tensorflow/python/module", + "//tensorflow/python/ops:variables", "//tensorflow/python/saved_model", "@absl_py//absl:app", "@absl_py//absl/flags", @@ -180,13 +183,11 @@ tf_cc_test( size = "medium", srcs = [ "tests/runtime_test_core.cc", - "tests/runtime_test_tfrt.cc", ], deps = [ ":runtime_test", "//tensorflow/cc/experimental/libtf/runtime", "//tensorflow/cc/experimental/libtf/runtime/core", - "//tensorflow/cc/experimental/libtf/runtime/tfrt", ], ) diff --git a/tensorflow/cc/experimental/libtf/object.h b/tensorflow/cc/experimental/libtf/object.h index 72d05aaf430..4e15a508e39 100644 --- a/tensorflow/cc/experimental/libtf/object.h +++ b/tensorflow/cc/experimental/libtf/object.h @@ -29,6 +29,8 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/c/eager/immediate_execution_tensor_handle.h" #include "tensorflow/cc/experimental/libtf/value.h" #include "tensorflow/core/platform/errors.h" @@ -172,7 +174,7 @@ class Object : public Handle { } } } - return tensorflow::errors::NotFound("Key not in dictionary."); + return absl::NotFoundError("Key not in dictionary."); } /// Sets `key` attribute with the underlying value of `h`. 
@@ -202,7 +204,7 @@ class Dictionary final : public Handle { tensorflow::StatusOr Get(const Handle& key) { auto it = value_.dict().find(key.value_); if (it != value_.dict().end()) return Cast(Handle(it->second)); - return tensorflow::errors::NotFound("Key not in dictionary."); + return absl::NotFoundError("Key not in dictionary."); } /// Sets `key` with value `value`. void Set(const String& key, Handle value) { @@ -282,7 +284,7 @@ tensorflow::Status Tensor::GetValue(absl::Span data) const { { const auto abstract_t = value_.tensor().get(); if (!tensorflow::ImmediateExecutionTensorHandle::classof(abstract_t)) { - return tensorflow::errors::InvalidArgument( + return absl::InvalidArgumentError( "Attempting to get value of non eager tensor."); } auto imm_t = @@ -315,7 +317,7 @@ class Tuple : public Handle { template tensorflow::StatusOr Get(size_t i) { if (i >= value_.tuple().size()) - return tensorflow::errors::InvalidArgument("Out of bounds index."); + return absl::InvalidArgumentError("Out of bounds index."); return Cast(Handle(value_.tuple()[i])); } @@ -348,7 +350,7 @@ class List final : public Handle { template tensorflow::StatusOr Get(size_t i) { if (i >= size()) { - return tensorflow::errors::InvalidArgument("Out of bounds index."); + return absl::InvalidArgumentError("Out of bounds index."); } return Cast(Handle(value_.list()[i])); } @@ -356,7 +358,7 @@ class List final : public Handle { /// Sets value `h` at index `i`. 
tensorflow::Status Set(size_t i, Handle h) { if (i >= size()) { - return tensorflow::errors::InvalidArgument("Out of bounds index."); + return absl::InvalidArgumentError("Out of bounds index."); } value_.list()[i] = std::move(h.value_); return ::tensorflow::OkStatus(); @@ -533,7 +535,7 @@ tensorflow::StatusOr Cast(Handle handle) { if (handle.value_.type() == TypeToTaggedType() || std::is_same::value) return T((std::move(handle.value_))); - return tensorflow::errors::InvalidArgument("Incompatible cast."); + return absl::InvalidArgumentError("Incompatible cast."); } // Converters for C++ primitives like float and int to handles. Allows callable @@ -656,10 +658,10 @@ class UneraseCallHelper { Handle h(std::move(args_in.tuple()[argument_index])); tensorflow::StatusOr x = Cast(std::move(h)); if (!x.ok()) - return tensorflow::errors::InvalidArgument( - std::string("Function ") + name + " Arg " + - std::to_string(argument_index) + - " cannot be cast to desired signature type "); + return absl::InvalidArgumentError( + absl::StrCat(std::string("Function ") + name + " Arg " + + std::to_string(argument_index) + + " cannot be cast to desired signature type ")); return UneraseCallHelper::template Call( name, fn, argument_index + 1, args_in, args..., *x); } @@ -683,9 +685,9 @@ class CallableWrapper { TaggedValue kwargs) { constexpr size_t argument_count = sizeof...(TFuncArgs); if (argument_count != args.tuple().size()) - return tensorflow::errors::InvalidArgument( - std::string("Function ") + name_ + " expected " + - std::to_string(argument_count) + " args."); + return absl::InvalidArgumentError( + absl::StrCat(std::string("Function ") + name_ + " expected " + + std::to_string(argument_count) + " args.")); return UneraseCallHelper::Call(name_, functor_, 0, args); } diff --git a/tensorflow/cc/experimental/libtf/runtime/tfrt/BUILD b/tensorflow/cc/experimental/libtf/runtime/tfrt/BUILD deleted file mode 100644 index 586ef6b9523..00000000000 --- 
a/tensorflow/cc/experimental/libtf/runtime/tfrt/BUILD +++ /dev/null @@ -1,26 +0,0 @@ -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - default_visibility = [ - "//tensorflow/cc/experimental/libtf:__subpackages__", - ], - licenses = ["notice"], -) - -cc_library( - name = "tfrt", - srcs = [ - "tfrt.cc", - ], - hdrs = [ - "tfrt.h", - ], - deps = [ - "//tensorflow/c:tf_status_helper", - "//tensorflow/c:tf_status_internal", - "//tensorflow/c/eager:c_api", - "//tensorflow/c/eager:c_api_experimental", - "//tensorflow/c/eager:tfe_context_internal", - "//tensorflow/cc/experimental/libtf", - "//tensorflow/cc/experimental/libtf/runtime", - ], -) diff --git a/tensorflow/cc/experimental/libtf/runtime/tfrt/tfrt.cc b/tensorflow/cc/experimental/libtf/runtime/tfrt/tfrt.cc deleted file mode 100644 index b50344fb0ed..00000000000 --- a/tensorflow/cc/experimental/libtf/runtime/tfrt/tfrt.cc +++ /dev/null @@ -1,45 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ -#include "tensorflow/cc/experimental/libtf/runtime/tfrt/tfrt.h" - -#include "tensorflow/c/eager/c_api.h" -#include "tensorflow/c/eager/c_api_experimental.h" -#include "tensorflow/c/eager/tfe_context_internal.h" -#include "tensorflow/c/tf_status_helper.h" -#include "tensorflow/c/tf_status_internal.h" -#include "tensorflow/cc/experimental/libtf/value.h" - -namespace tf { -namespace libtf { -namespace runtime { -namespace tfrt { - -runtime::Runtime Runtime() { - TFE_Context* ctx; - TFE_ContextOptions* ctx_options = TFE_NewContextOptions(); - TFE_ContextOptionsSetTfrt(ctx_options, true); - TFE_ContextOptionsSetDevicePlacementPolicy(ctx_options, - TFE_DEVICE_PLACEMENT_WARN); - TF_Status* status = TF_NewStatus(); - ctx = TFE_NewContext(ctx_options, status); - TF_DeleteStatus(status); - TFE_DeleteContextOptions(ctx_options); - return runtime::Runtime(tensorflow::unwrap(ctx)); -} - -} // namespace tfrt -} // namespace runtime -} // namespace libtf -} // namespace tf diff --git a/tensorflow/cc/experimental/libtf/tests/function_test.cc b/tensorflow/cc/experimental/libtf/tests/function_test.cc index a9b4061f1a0..fa1f21389df 100644 --- a/tensorflow/cc/experimental/libtf/tests/function_test.cc +++ b/tensorflow/cc/experimental/libtf/tests/function_test.cc @@ -288,7 +288,7 @@ TEST_P(FunctionTest, IncorrectDtypeInOutputSignatureFails) { INSTANTIATE_TEST_SUITE_P(TF2CAPI, FunctionTest, ::testing::Combine(::testing::Values("graphdef", "mlir"), - ::testing::Values(false, true))); + ::testing::Values(false))); } // namespace libtf } // namespace tf diff --git a/tensorflow/cc/experimental/libtf/tests/tensor_test.cc b/tensorflow/cc/experimental/libtf/tests/tensor_test.cc index 3f4708f0f0d..85243dd4287 100644 --- a/tensorflow/cc/experimental/libtf/tests/tensor_test.cc +++ b/tensorflow/cc/experimental/libtf/tests/tensor_test.cc @@ -123,7 +123,7 @@ TEST_P(UnifiedCAPI, SimpleCreationFunctions) { 
INSTANTIATE_TEST_SUITE_P(Tracing, UnifiedCAPI, ::testing::Combine(::testing::Values("graphdef", "mlir"), - ::testing::Values(true, false))); + ::testing::Values(false))); } // namespace libtf } // namespace tf diff --git a/tensorflow/cc/experimental/libtf/tests/variable_test.cc b/tensorflow/cc/experimental/libtf/tests/variable_test.cc index 8e7aca22bdc..1e37ed9cb2b 100644 --- a/tensorflow/cc/experimental/libtf/tests/variable_test.cc +++ b/tensorflow/cc/experimental/libtf/tests/variable_test.cc @@ -114,7 +114,7 @@ TEST_P(VariableTest, CreateAssignReadDestroy) { INSTANTIATE_TEST_SUITE_P(TF2CAPI, VariableTest, ::testing::Combine(::testing::Values("graphdef", "mlir"), - ::testing::Values(false, true))); + ::testing::Values(false))); } // namespace libtf } // namespace tf diff --git a/tensorflow/cc/framework/ops.h b/tensorflow/cc/framework/ops.h index d19b895654b..ab8b387ab56 100644 --- a/tensorflow/cc/framework/ops.h +++ b/tensorflow/cc/framework/ops.h @@ -21,6 +21,8 @@ limitations under the License. 
#include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/graph/graph.h" @@ -156,9 +158,9 @@ class Input { typedef typename RealType::type RealT; Tensor t(DataTypeToEnum::v(), shape); if (t.NumElements() != static_cast(v.size())) { - status = errors::InvalidArgument( + status = absl::InvalidArgumentError(absl::StrCat( "Cannot construct a tensor with ", t.NumElements(), - " from an initializer list with ", v.size(), " elements"); + " from an initializer list with ", v.size(), " elements")); return; } std::copy_n(v.begin(), v.size(), t.flat().data()); diff --git a/tensorflow/cc/gradients/math_grad_test.cc b/tensorflow/cc/gradients/math_grad_test.cc index 59ea373bd6d..b3d77f29b06 100644 --- a/tensorflow/cc/gradients/math_grad_test.cc +++ b/tensorflow/cc/gradients/math_grad_test.cc @@ -466,7 +466,7 @@ TEST_F(CWiseUnaryGradTest, Asin_Complex) { }; // TODO(kbsriram) // Enable test when the asin kernel supports complex numbers - if (false) { + if (/* DISABLES CODE */ (false)) { TestCWiseGrad(ASIN, x_fn); } } @@ -482,7 +482,7 @@ TEST_F(CWiseUnaryGradTest, Acos_Complex) { }; // TODO(kbsriram) // Add test when the acos kernel supports complex numbers - if (false) { + if (/* DISABLES CODE */ (false)) { TestCWiseGrad(ACOS, x_fn); } } @@ -510,7 +510,7 @@ TEST_F(CWiseUnaryGradTest, Atan_Complex) { }; // TODO(kbsriram) // Add test when the atan kernel supports complex numbers - if (false) { + if (/* DISABLES CODE */ (false)) { TestCWiseGrad(ATAN, x_fn); } } @@ -561,7 +561,7 @@ TEST_F(CWiseUnaryGradTest, Lgamma_Complex) { }; // TODO(kbsriram) // Add test when the lgamma kernel supports complex numbers - if (false) { + if (/* DISABLES CODE */ (false)) { TestCWiseGrad(LGAMMA, x_fn); } } @@ -579,7 +579,7 @@ TEST_F(CWiseUnaryGradTest, Erf_Complex) { }; // TODO(kbsriram) // Add test when the erf kernel supports complex numbers - if (false) 
{ + if (/* DISABLES CODE */ (false)) { TestCWiseGrad(ERF, x_fn); } } diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index d52db030b1b..b9764d72c7e 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -1,4 +1,5 @@ -# Description: +#include "third_party/absl/strings/str_cat.h" +#Description: # TensorFlow SavedModel. load("//tensorflow:tensorflow.default.bzl", "filegroup") @@ -6,6 +7,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load( "//tensorflow:tensorflow.bzl", "if_android", + "if_google", "if_mobile", "if_not_mobile", "tf_cc_test", @@ -28,6 +30,8 @@ package( exports_files([ "loader.h", + "testdata/chunked_saved_model/chunked_model/saved_model.cpb", + "testdata/chunked_saved_model/chunked_model/saved_model.pbtxt", ]) cc_library( @@ -63,7 +67,9 @@ cc_library( ":metrics", ":util", "//tensorflow/core:protos_all_cc", - ] + if_not_mobile([ + ] + if_google([ + "//tensorflow/tools/proto_splitter:merge", + ]) + if_not_mobile([ # TODO(b/111634734): :lib and :protos_all contain dependencies that # cannot be built on mobile platforms. Instead, include the appropriate # tf_lib depending on the build platform. 
@@ -87,9 +93,8 @@ tf_cc_test( ":tag_constants", "//tensorflow/core:lib", "//tensorflow/core:test", - "//tensorflow/core:test_main", - "//tensorflow/core:testlib", "//tensorflow/core/platform:resource_loader", + "@com_google_googletest//:gtest_main", ], ) @@ -131,6 +136,8 @@ cc_library( ":fingerprinting", ":loader_util", ":reader", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ] + if_not_mobile([ ":metrics", ":util", @@ -252,15 +259,15 @@ py_binary( python_version = "PY3", srcs_version = "PY3", deps = [ - "//tensorflow/python:client_testlib", - "//tensorflow/python:dtypes", - "//tensorflow/python:framework_ops", - "//tensorflow/python:lookup_ops", - "//tensorflow/python:tensor_spec", - "//tensorflow/python:variables", "//tensorflow/python/compat:v2_compat", "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor_spec", "//tensorflow/python/module", + "//tensorflow/python/ops:lookup_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:client_testlib", "//tensorflow/python/saved_model", "//tensorflow/python/saved_model:save_options", "//tensorflow/python/trackable:asset", @@ -268,6 +275,31 @@ py_binary( ], ) +# copybara:uncomment_begin(google-only) +# +# py_binary( +# name = "testdata/generate_chunked_models", +# srcs = ["testdata/generate_chunked_models.py"], +# python_version = "PY3", +# srcs_version = "PY3", +# deps = [ +# "//tensorflow/python/compat:v2_compat", +# "//tensorflow/python/eager:def_function", +# "//tensorflow/python/framework:constant_op", +# "//tensorflow/python/module", +# "//tensorflow/python/platform:client_testlib", +# "//tensorflow/python/saved_model:loader", +# "//tensorflow/python/saved_model:save", +# "//tensorflow/python/saved_model:save_options", +# "//tensorflow/python/util:compat", +# "//tensorflow/tools/proto_splitter:constants", +# 
"//tensorflow/tools/proto_splitter/python:saved_model", +# "@absl_py//absl:app", +# ], +# ) +# +# copybara:uncomment_end + # TODO(b/32673259): add a test to continuously validate these files. filegroup( name = "saved_model_test_files", @@ -284,6 +316,7 @@ filegroup( "testdata/fuzz_generated/**", "testdata/SimpleV1Model/**", "testdata/OptimizerSlotVariableModule/**", + "testdata/chunked_saved_model/**", ]), ) @@ -369,7 +402,7 @@ tf_cc_test( ":metrics", "//tensorflow/core:test", "//tensorflow/core:test_main", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", "@jsoncpp_git//:jsoncpp", ], ) diff --git a/tensorflow/cc/saved_model/bundle_v2.cc b/tensorflow/cc/saved_model/bundle_v2.cc index 21692edbf40..85af07ce0d5 100644 --- a/tensorflow/cc/saved_model/bundle_v2.cc +++ b/tensorflow/cc/saved_model/bundle_v2.cc @@ -39,60 +39,6 @@ using strings::StrCat; // `tensorflow::SavedModelV2Bundle::Load` API label. constexpr char kCCLoadBundleV2Label[] = "cc_load_bundle_v2"; -Status ReadSavedModelProto(const string& export_dir, - SavedModel* saved_model_proto) { - LOG(INFO) << "Reading SavedModel from: " << export_dir; - - const string saved_model_pb_path = - io::JoinPath(export_dir, kSavedModelFilenamePb); - Status found_pb = Env::Default()->FileExists(saved_model_pb_path); - if (found_pb.ok()) { - Status result = - ReadBinaryProto(Env::Default(), saved_model_pb_path, saved_model_proto); - if (result.ok()) { - metrics::SavedModelReadCount( - saved_model::GetWriteVersion(*saved_model_proto)) - .IncrementBy(1); - } - return result; - } - - const string saved_model_pbtxt_path = - io::JoinPath(export_dir, kSavedModelFilenamePbTxt); - Status found_pbtxt = Env::Default()->FileExists(saved_model_pbtxt_path); - if (found_pbtxt.ok()) { - Status result = ReadTextProto(Env::Default(), saved_model_pbtxt_path, - saved_model_proto); - if (result.ok()) { - metrics::SavedModelReadCount( - saved_model::GetWriteVersion(*saved_model_proto)) - .IncrementBy(1); - } - return 
result; - } - - Status err; - if (found_pb.code() == found_pbtxt.code()) { - err = Status(found_pb.code(), - StrCat(found_pb.message(), "\n", found_pbtxt.message())); - } else if (found_pb.code() == NOT_FOUND) { - err = found_pbtxt; - } else if (found_pbtxt.code() == NOT_FOUND) { - err = found_pb; - } else { - // found_pb and found_pbtxt both errored, w/ different codes, neither being - // NOT_FOUND. - err = Status( - absl::StatusCode::kInternal, - StrCat("Different errors encountered while looking for saved_model.pb " - "and saved_model.pbtxt in the export directory path \"", - export_dir, "\": \n", found_pb.ToString(), "\n", - found_pbtxt.ToString())); - } - - return err; -} - Status ReadCheckpointObjectGraph(BundleReader* bundle_reader, TrackableObjectGraph* object_graph) { Tensor object_graph_tensor; @@ -123,7 +69,7 @@ Status SavedModelV2Bundle::Load(const std::string& export_dir, SavedModelV2Bundle* const bundle) { metrics::SavedModelReadApi(kCCLoadBundleV2Label).IncrementBy(1); SavedModel saved_model_proto; - TF_RETURN_IF_ERROR(ReadSavedModelProto(export_dir, &saved_model_proto)); + TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto)); metrics::SavedModelReadPath().Set(export_dir); // Load MetaGraphDef. diff --git a/tensorflow/cc/saved_model/constants.h b/tensorflow/cc/saved_model/constants.h index b571a643113..e8a267e3280 100644 --- a/tensorflow/cc/saved_model/constants.h +++ b/tensorflow/cc/saved_model/constants.h @@ -19,56 +19,63 @@ limitations under the License. namespace tensorflow { // SavedModel assets directory. -constexpr char kSavedModelAssetsDirectory[] = "assets"; +inline constexpr char kSavedModelAssetsDirectory[] = "assets"; // SavedModel assets.extra directory. -constexpr char kSavedModelAssetsExtraDirectory[] = "assets.extra"; +inline constexpr char kSavedModelAssetsExtraDirectory[] = "assets.extra"; // SavedModel assets key for graph collection-def. 
-constexpr char kSavedModelAssetsKey[] = "saved_model_assets"; +inline constexpr char kSavedModelAssetsKey[] = "saved_model_assets"; /// SavedModel legacy init op collection key. Used in v1 SavedModels. -constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op"; +inline constexpr char kSavedModelLegacyInitOpKey[] = "legacy_init_op"; /// SavedModel main op collection key. Used in v1 SavedModels. -constexpr char kSavedModelMainOpKey[] = "saved_model_main_op"; +inline constexpr char kSavedModelMainOpKey[] = "saved_model_main_op"; // CollectionDef key for the SavedModel train op. // Not exported while export_all_saved_models is experimental. -constexpr char kSavedModelTrainOpKey[] = "saved_model_train_op"; +inline constexpr char kSavedModelTrainOpKey[] = "saved_model_train_op"; // Schema version for SavedModel. -constexpr int kSavedModelSchemaVersion = 1; +inline constexpr int kSavedModelSchemaVersion = 1; +// SavedModel proto filename prefix. +inline constexpr char kSavedModelFilenamePrefix[] = "saved_model"; // SavedModel proto filename. -constexpr char kSavedModelFilenamePb[] = "saved_model.pb"; +inline constexpr char kSavedModelFilenamePb[] = "saved_model.pb"; + +// SavedModel chunked proto filename. +inline constexpr char kSavedModelFilenameCpb[] = "saved_model.cpb"; // SavedModel text format proto filename. -constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt"; +inline constexpr char kSavedModelFilenamePbTxt[] = "saved_model.pbtxt"; // Subdirectory where debugging related files are written. -constexpr char kSavedModelDebugDirectory[] = "debug"; +inline constexpr char kSavedModelDebugDirectory[] = "debug"; // File name for GraphDebugInfo protocol buffer which corresponds to the // SavedModel. -constexpr char kSavedModelDebugInfoFilenamePb[] = "saved_model_debug_info.pb"; +inline constexpr char kSavedModelDebugInfoFilenamePb[] = + "saved_model_debug_info.pb"; // Directory in which to save the SavedModel variables. 
-constexpr char kSavedModelVariablesDirectory[] = "variables"; +inline constexpr char kSavedModelVariablesDirectory[] = "variables"; // SavedModel variables filename. -constexpr char kSavedModelVariablesFilename[] = "variables"; +inline constexpr char kSavedModelVariablesFilename[] = "variables"; // SavedModel SignatureDef keys for the initialization and train ops. Used in // V2 SavedModels. -constexpr char kSavedModelInitOpSignatureKey[] = "__saved_model_init_op"; -constexpr char kSavedModelTrainOpSignatureKey[] = "__saved_model_train_op"; +inline constexpr char kSavedModelInitOpSignatureKey[] = "__saved_model_init_op"; +inline constexpr char kSavedModelTrainOpSignatureKey[] = + "__saved_model_train_op"; // Key in the TensorBundle for the object graph proto. -constexpr char kObjectGraphProtoKey[] = "_CHECKPOINTABLE_OBJECT_GRAPH"; +inline constexpr char kObjectGraphProtoKey[] = "_CHECKPOINTABLE_OBJECT_GRAPH"; // Filename for the FingerprintDef protocol buffer. -constexpr char kFingerprintFilenamePb[] = "fingerprint.pb"; +inline constexpr char kFingerprintFilenamePb[] = "fingerprint.pb"; } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/fingerprinting.cc b/tensorflow/cc/saved_model/fingerprinting.cc index 389b28bf278..d4f05c472b9 100644 --- a/tensorflow/cc/saved_model/fingerprinting.cc +++ b/tensorflow/cc/saved_model/fingerprinting.cc @@ -120,23 +120,29 @@ uint64 HashCheckpointIndexFile(absl::string_view model_dir) { StatusOr CreateFingerprintDef(const SavedModel& saved_model, absl::string_view export_dir) { + SavedModel copy = saved_model; + return CreateFingerprintDef(©, export_dir); +} + +StatusOr CreateFingerprintDef(SavedModel* saved_model, + absl::string_view export_dir) { // Create a copy of `metagraph` which will be used and mutated for fingerprint // computation. 
- MetaGraphDef metagraph_copy = saved_model.meta_graphs(0); FingerprintDef fingerprint_def; + MetaGraphDef* metagraph = saved_model->mutable_meta_graphs(0); // Set fingerprint field #1. - fingerprint_def.set_saved_model_checksum(HashSavedModel(saved_model)); + fingerprint_def.set_saved_model_checksum(HashSavedModel(*saved_model)); // Set fingerprint field #2. - graph_regularization::SimpleDelete(*metagraph_copy.mutable_graph_def()); + graph_regularization::SimpleDelete(*metagraph->mutable_graph_def()); fingerprint_def.set_graph_def_program_hash( - graph_regularization::ComputeHash(metagraph_copy.graph_def())); + graph_regularization::ComputeHash(metagraph->graph_def())); // Set fingerprint field #3. fingerprint_def.set_signature_def_hash( - RegularizeAndHashSignatureDefs(metagraph_copy.signature_def())); + RegularizeAndHashSignatureDefs(metagraph->signature_def())); // Set fingerprint field #4. TF_ASSIGN_OR_RETURN( StatusOr object_graph_hash, - RegularizeAndHashSavedObjectGraph(metagraph_copy.object_graph_def())); + RegularizeAndHashSavedObjectGraph(metagraph->object_graph_def())); fingerprint_def.set_saved_object_graph_hash(object_graph_hash.value()); // Set fingerprint field #5. fingerprint_def.set_checkpoint_hash(HashCheckpointIndexFile(export_dir)); diff --git a/tensorflow/cc/saved_model/fingerprinting.h b/tensorflow/cc/saved_model/fingerprinting.h index 1d0a830b4d2..cabca831076 100644 --- a/tensorflow/cc/saved_model/fingerprinting.h +++ b/tensorflow/cc/saved_model/fingerprinting.h @@ -31,6 +31,12 @@ namespace tensorflow::saved_model::fingerprinting { StatusOr CreateFingerprintDef(const SavedModel& saved_model, absl::string_view export_dir); +// Creates a FingerprintDef proto from a SavedModel and the checkpoint meta file +// (.index) in `export_dir`. The passed in `saved_model` is mutated and should +// not be used afterwards. 
+StatusOr CreateFingerprintDef(SavedModel* saved_model, + absl::string_view export_dir); + // Loads the `fingerprint.pb` from `export_dir`, returns an error if there is // none. StatusOr ReadSavedModelFingerprint( diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index b9544bc7555..c0a816120cb 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -18,6 +18,8 @@ limitations under the License. #include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/cc/saved_model/fingerprinting.h" #include "tensorflow/cc/saved_model/loader_util.h" @@ -90,16 +92,16 @@ static Status ValidateNode(const NodeDef& node) { if (node_value.has_tensor()) { const PartialTensorShape node_shape(node_value.tensor().tensor_shape()); if (node_shape.num_elements() < 0) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "Saved model contains node \"", node.name(), "\" (op \"", node.op(), "\") which initializes from a tensor with ", - node_shape.num_elements(), " elements"); + node_shape.num_elements(), " elements")); } } } else if (node.op() == "Const") { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "Saved model contains node \"", node.name(), - "\" which is a constant tensor but no value has been provided"); + "\" which is a constant tensor but no value has been provided")); } return OkStatus(); } @@ -108,9 +110,9 @@ static Status ValidateFunctionNotRecursive(const FunctionDef& function) { const auto& function_name = function.signature().name(); for (const auto& node : function.node_def()) { if (node.op() == function_name) { - return errors::FailedPrecondition( + return absl::FailedPreconditionError(absl::StrCat( "Function ", function_name, - " is self recursive and TensorFlow does not support this scenario."); + " is self recursive and 
TensorFlow does not support this scenario.")); } } @@ -340,17 +342,17 @@ class LiteSessionWrapper : public Session { : wrapped_(std::move(wrapped)) {} Status Create(const GraphDef& graph) override { - return errors::Unimplemented("Session::Create()"); + return absl::UnimplementedError("Session::Create()"); } Status Create(GraphDef&& graph) override { - return errors::Unimplemented("Session::Create()"); + return absl::UnimplementedError("Session::Create()"); } Status Extend(const GraphDef& graph) override { - return errors::Unimplemented("Session::Extend()"); + return absl::UnimplementedError("Session::Extend()"); } Status Extend(GraphDef&& graph) override { - return errors::Unimplemented("Session::Extend()"); + return absl::UnimplementedError("Session::Extend()"); } Status Run(const std::vector>& inputs, @@ -362,16 +364,16 @@ class LiteSessionWrapper : public Session { } Status Create(const RunOptions& run_options, const GraphDef& graph) override { - return errors::Unimplemented("Session::Create()"); + return absl::UnimplementedError("Session::Create()"); } Status Extend(const RunOptions& run_options, const GraphDef& graph) override { - return errors::Unimplemented("Session::Extend()"); + return absl::UnimplementedError("Session::Extend()"); } Status Create(const RunOptions& run_options, GraphDef&& graph) override { - return errors::Unimplemented("Session::Create()"); + return absl::UnimplementedError("Session::Create()"); } Status Extend(const RunOptions& run_options, GraphDef&& graph) override { - return errors::Unimplemented("Session::Extend()"); + return absl::UnimplementedError("Session::Extend()"); } Status Close(const RunOptions& run_options) override { return wrapped_->Close(run_options); @@ -390,14 +392,14 @@ class LiteSessionWrapper : public Session { const std::vector& output_names, const std::vector& target_nodes, string* handle) override { - return errors::Unimplemented("Session::PRunSetup()"); + return absl::UnimplementedError("Session::PRunSetup()"); 
} Status PRun(const string& handle, const std::vector>& inputs, const std::vector& output_names, std::vector* outputs) override { - return errors::Unimplemented("Session::PRun()"); + return absl::UnimplementedError("Session::PRun()"); } Status ListDevices(std::vector* response) override { diff --git a/tensorflow/cc/saved_model/reader.cc b/tensorflow/cc/saved_model/reader.cc index 40ba3e4a4e4..365874881dd 100644 --- a/tensorflow/cc/saved_model/reader.cc +++ b/tensorflow/cc/saved_model/reader.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/cc/saved_model/reader.h" +#include +#include #include +#include #include "absl/memory/memory.h" #include "tensorflow/cc/saved_model/constants.h" @@ -31,60 +34,17 @@ limitations under the License. #include "tensorflow/core/lib/strings/strcat.h" #include "tensorflow/core/platform/env.h" #include "tensorflow/core/platform/file_system_helper.h" +#include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/statusor.h" #include "tensorflow/core/protobuf/saved_model.pb.h" #include "tensorflow/core/util/tensor_bundle/byte_swap_tensor.h" +// Placeholder for protosplitter merger include. + +#define IS_OSS true namespace tensorflow { namespace { -// Reads the SavedModel proto from saved_model.pb in `export_dir`. -// Returns a failure status when the SavedModel file does not exist. 
-Status ReadSavedModel(absl::string_view export_dir, - SavedModel* saved_model_proto) { - LOG(INFO) << "Reading SavedModel from: " << export_dir; - - const std::string saved_model_pb_path = - io::JoinPath(export_dir, kSavedModelFilenamePb); - - TF_ASSIGN_OR_RETURN( - bool saved_model_pb_exists, - internal::FileExists(Env::Default(), saved_model_pb_path)); - if (saved_model_pb_exists) { - Status result = - ReadBinaryProto(Env::Default(), saved_model_pb_path, saved_model_proto); - if (result.ok()) { - metrics::SavedModelReadCount( - saved_model::GetWriteVersion(*saved_model_proto)) - .IncrementBy(1); - } - return result; - } - const std::string saved_model_pbtxt_path = - io::JoinPath(export_dir, kSavedModelFilenamePbTxt); - TF_ASSIGN_OR_RETURN( - bool saved_model_pbtxt_exists, - internal::FileExists(Env::Default(), saved_model_pbtxt_path)); - if (saved_model_pbtxt_exists) { - Status result = ReadTextProto(Env::Default(), saved_model_pbtxt_path, - saved_model_proto); - if (result.ok()) { - metrics::SavedModelReadCount( - saved_model::GetWriteVersion(*saved_model_proto)) - .IncrementBy(1); - } - return result; - } - return Status( - absl::StatusCode::kNotFound, - strings::StrCat("Could not find SavedModel .pb or .pbtxt at supplied " - "export directory path: ", - export_dir, - ". Check that " - "the directory exists and that you have the right " - "permissions for accessing it.")); -} - Status FindMetaGraphDef(const std::unordered_set& tags, SavedModel* saved_model_proto, MetaGraphDef* meta_graph_def) { @@ -116,6 +76,61 @@ Status FindMetaGraphDef(const std::unordered_set& tags, } } // namespace +// Reads the SavedModel proto from saved_model.pb in `export_dir`. +// Returns a failure status when the SavedModel file does not exist. 
+Status ReadSavedModel(absl::string_view export_dir, + SavedModel* saved_model_proto) { + LOG(INFO) << "Reading SavedModel from: " << export_dir; + + if (IS_OSS) { + const std::string saved_model_pb_path = + io::JoinPath(export_dir, kSavedModelFilenamePb); + TF_ASSIGN_OR_RETURN( + bool saved_model_pb_exists, + internal::FileExists(Env::Default(), saved_model_pb_path)); + if (saved_model_pb_exists) { + Status result = ReadBinaryProto(Env::Default(), saved_model_pb_path, + saved_model_proto); + if (result.ok()) { + metrics::SavedModelReadCount( + saved_model::GetWriteVersion(*saved_model_proto)) + .IncrementBy(1); + } + return result; + } + } + + const std::string saved_model_pbtxt_path = + io::JoinPath(export_dir, kSavedModelFilenamePbTxt); + TF_ASSIGN_OR_RETURN( + bool saved_model_pbtxt_exists, + internal::FileExists(Env::Default(), saved_model_pbtxt_path)); + if (saved_model_pbtxt_exists) { + Status result = ReadTextProto(Env::Default(), saved_model_pbtxt_path, + saved_model_proto); + if (result.ok()) { + metrics::SavedModelReadCount( + saved_model::GetWriteVersion(*saved_model_proto)) + .IncrementBy(1); + } + return result; + } + + if (!IS_OSS) { + // Only use Merger outside of OSS. + // Placeholder for protosplitter merger call. + } + + return Status( + absl::StatusCode::kNotFound, + strings::StrCat("Could not find SavedModel .pb or .pbtxt at supplied " + "export directory path: ", + export_dir, + ". 
Check that " + "the directory exists and that you have the right " + "permissions for accessing it.")); +} + Status ReadMetaGraphDefFromSavedModel(const string& export_dir, const std::unordered_set& tags, MetaGraphDef* const meta_graph_def) { @@ -140,8 +155,7 @@ Status ReadSavedModelDebugInfoIfPresent( GraphDebugInfo debug_info; TF_RETURN_IF_ERROR( ReadBinaryProto(Env::Default(), debug_info_pb_path, &debug_info)); - *debug_info_proto = - absl::make_unique(std::move(debug_info)); + *debug_info_proto = std::make_unique(std::move(debug_info)); } return OkStatus(); } diff --git a/tensorflow/cc/saved_model/reader.h b/tensorflow/cc/saved_model/reader.h index f51fbeb557f..e82bd449c59 100644 --- a/tensorflow/cc/saved_model/reader.h +++ b/tensorflow/cc/saved_model/reader.h @@ -18,21 +18,26 @@ limitations under the License. #ifndef TENSORFLOW_CC_SAVED_MODEL_READER_H_ #define TENSORFLOW_CC_SAVED_MODEL_READER_H_ +#include #include #include #include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/protobuf/saved_model.pb.h" namespace tensorflow { +Status ReadSavedModel(absl::string_view export_dir, + SavedModel* saved_model_proto); + // Reads the SavedModel proto from saved_model.pb(txt) in the given directory, // finds the MetaGraphDef that matches the given set of tags and writes it to // the `meta_graph_def` parameter. Returns a failure status when the SavedModel // file does not exist or no MetaGraphDef matches the tags. Status ReadMetaGraphDefFromSavedModel(const string& export_dir, const std::unordered_set& tags, - MetaGraphDef* const meta_graph_def); + MetaGraphDef* meta_graph_def); // Store debug info from the SavedModel export dir. 
Status ReadSavedModelDebugInfoIfPresent( diff --git a/tensorflow/cc/saved_model/reader_test.cc b/tensorflow/cc/saved_model/reader_test.cc index 4b8b5cde20d..7e00186b3ad 100644 --- a/tensorflow/cc/saved_model/reader_test.cc +++ b/tensorflow/cc/saved_model/reader_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/cc/saved_model/reader.h" +#include #include "tensorflow/cc/saved_model/constants.h" #include "tensorflow/cc/saved_model/metrics.h" #include "tensorflow/cc/saved_model/tag_constants.h" @@ -24,7 +25,6 @@ limitations under the License. #include "tensorflow/core/lib/strings/str_util.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/resource_loader.h" -#include "tensorflow/core/platform/test.h" namespace tensorflow { namespace { @@ -39,6 +39,16 @@ string TestDataSharded() { "half_plus_two", "00000123"); } +string ChunkedSavedModel() { + return io::JoinPath("tensorflow", "cc", "saved_model", "testdata", + "chunked_saved_model", "chunked_model"); +} + +string NonChunkedSavedModel() { + return io::JoinPath("tensorflow", "cc", "saved_model", "testdata", + "chunked_saved_model", "non_chunked_model"); +} + class ReaderTest : public ::testing::Test { protected: ReaderTest() {} @@ -88,15 +98,6 @@ TEST_F(ReaderTest, NoTagMatchMultiple) { << st.message(); } -TEST_F(ReaderTest, PbtxtFormat) { - MetaGraphDef meta_graph_def; - - const string export_dir = GetDataDependencyFilepath(TestDataPbTxt()); - TF_ASSERT_OK(ReadMetaGraphDefFromSavedModel(export_dir, {kSavedModelTagServe}, - &meta_graph_def)); - CheckMetaGraphDef(meta_graph_def); -} - TEST_F(ReaderTest, InvalidExportPath) { MetaGraphDef meta_graph_def; @@ -136,5 +137,7 @@ TEST_F(ReaderTest, MetricsUpdatedSuccessfulRead) { EXPECT_EQ(metrics::SavedModelReadCount("1").value(), read_count_v1 + 1); } +// Placeholder for protosplitter merger merge test. 
+ } // namespace } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/testdata/chunked_saved_model/chunked_model/saved_model.cpb b/tensorflow/cc/saved_model/testdata/chunked_saved_model/chunked_model/saved_model.cpb new file mode 100644 index 00000000000..d9f76e3b4a9 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/chunked_saved_model/chunked_model/saved_model.cpb differ diff --git a/tensorflow/cc/saved_model/testdata/chunked_saved_model/chunked_model/saved_model.pbtxt b/tensorflow/cc/saved_model/testdata/chunked_saved_model/chunked_model/saved_model.pbtxt new file mode 100644 index 00000000000..4a37bd88fb4 --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/chunked_saved_model/chunked_model/saved_model.pbtxt @@ -0,0 +1,2063 @@ +saved_model_schema_version: 1 +meta_graphs { + meta_info_def { + stripped_op_list { + op { + name: "Const" + output_arg { + name: "output" + type_attr: "dtype" + } + attr { + name: "value" + type: "tensor" + } + attr { + name: "dtype" + type: "type" + } + } + op { + name: "Identity" + input_arg { + name: "input" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + } + op { + name: "MergeV2Checkpoints" + input_arg { + name: "checkpoint_prefixes" + type: DT_STRING + } + input_arg { + name: "destination_prefix" + type: DT_STRING + } + attr { + name: "delete_old_dirs" + type: "bool" + default_value { + b: true + } + } + attr { + name: "allow_missing_files" + type: "bool" + default_value { + b: false + } + } + is_stateful: true + } + op { + name: "NoOp" + } + op { + name: "Pack" + input_arg { + name: "values" + type_attr: "T" + number_attr: "N" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "N" + type: "int" + has_minimum: true + minimum: 1 + } + attr { + name: "T" + type: "type" + } + attr { + name: "axis" + type: "int" + default_value { + i: 0 + } + } + } + op { + name: "PartitionedCall" + input_arg { + name: "args" + 
type_list_attr: "Tin" + } + output_arg { + name: "output" + type_list_attr: "Tout" + } + attr { + name: "Tin" + type: "list(type)" + has_minimum: true + } + attr { + name: "Tout" + type: "list(type)" + has_minimum: true + } + attr { + name: "f" + type: "func" + } + attr { + name: "config" + type: "string" + default_value { + s: "" + } + } + attr { + name: "config_proto" + type: "string" + default_value { + s: "" + } + } + attr { + name: "executor_type" + type: "string" + default_value { + s: "" + } + } + } + op { + name: "Placeholder" + output_arg { + name: "output" + type_attr: "dtype" + } + attr { + name: "dtype" + type: "type" + } + attr { + name: "shape" + type: "shape" + default_value { + shape { + unknown_rank: true + } + } + } + } + op { + name: "RestoreV2" + input_arg { + name: "prefix" + type: DT_STRING + } + input_arg { + name: "tensor_names" + type: DT_STRING + } + input_arg { + name: "shape_and_slices" + type: DT_STRING + } + output_arg { + name: "tensors" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + is_stateful: true + } + op { + name: "SaveV2" + input_arg { + name: "prefix" + type: DT_STRING + } + input_arg { + name: "tensor_names" + type: DT_STRING + } + input_arg { + name: "shape_and_slices" + type: DT_STRING + } + input_arg { + name: "tensors" + type_list_attr: "dtypes" + } + attr { + name: "dtypes" + type: "list(type)" + has_minimum: true + minimum: 1 + } + is_stateful: true + } + op { + name: "Select" + input_arg { + name: "condition" + type: DT_BOOL + } + input_arg { + name: "t" + type_attr: "T" + } + input_arg { + name: "e" + type_attr: "T" + } + output_arg { + name: "output" + type_attr: "T" + } + attr { + name: "T" + type: "type" + } + } + op { + name: "ShardedFilename" + input_arg { + name: "basename" + type: DT_STRING + } + input_arg { + name: "shard" + type: DT_INT32 + } + input_arg { + name: "num_shards" + type: DT_INT32 + } + output_arg { + name: "filename" + type: 
DT_STRING + } + } + op { + name: "StatefulPartitionedCall" + input_arg { + name: "args" + type_list_attr: "Tin" + } + output_arg { + name: "output" + type_list_attr: "Tout" + } + attr { + name: "Tin" + type: "list(type)" + has_minimum: true + } + attr { + name: "Tout" + type: "list(type)" + has_minimum: true + } + attr { + name: "f" + type: "func" + } + attr { + name: "config" + type: "string" + default_value { + s: "" + } + } + attr { + name: "config_proto" + type: "string" + default_value { + s: "" + } + } + attr { + name: "executor_type" + type: "string" + default_value { + s: "" + } + } + is_stateful: true + is_distributed_communication: true + } + op { + name: "StaticRegexFullMatch" + input_arg { + name: "input" + type: DT_STRING + } + output_arg { + name: "output" + type: DT_BOOL + } + attr { + name: "pattern" + type: "string" + } + } + op { + name: "StringJoin" + input_arg { + name: "inputs" + type: DT_STRING + number_attr: "N" + } + output_arg { + name: "output" + type: DT_STRING + } + attr { + name: "N" + type: "int" + has_minimum: true + } + attr { + name: "separator" + type: "string" + default_value { + s: "" + } + } + } + } + tags: "serve" + tensorflow_version: "2.14.0" + tensorflow_git_version: "unknown" + stripped_default_attrs: true + } + graph_def { + node { + name: "Const" + op: "Const" + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_DOUBLE + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_DOUBLE + tensor_shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + tensor_content: 
"`\250S-\322\210\346?\021\261\230\257\371\257\340?\301\244\352\345g\221\346?\006\231\202C\203#\342?\241\001AY7\240\355?\031\275\343\257\231\272\340?X6B\007s\006\306?\364\216\221\020\241\'\303?\022\273\025\211)\010\336?\330[\324\264\201\231\323?\357\010\212\342\230\037\347?\240\272\301Fe\033\344?\3029J\332\352.\343?\2648\250\3644\034\317?\206\341\225\036\302\331\330?\200t\364\331Z\327\214?\364\341\007\n\201\002\350?b2\254e\243\006\327?\240ra\\\004#\255?\016\247\200\0006\362\334?z\203\247\034\253d\354?\320@pj\240\337\325?JM\251<\340\370\334?JQ{\352=w\330?\214\317D\r\267\314\336?\020\343!\237\026\300\336?\350\333\235\231r6\305?3l\223\335\343\323\347?\300z\344\275j\300\251?\201\345\321\227P\\\344?\031\030$\225\031\302\356?t\351d4\266D\315?\362\306m\265\254\037\354?\206\361\305F\324\002\343?\337\224\201\217\252\211\341?\302T\205x\271\205\345?!D7\244w\311\352?>U\244\250\333\260\330?@\2650\377 \360\271?\200#{\327H\032\206?\224\341\252\022\016\010\317?\024x\360l\215k\356?\300\263\234tb\230\206?Hbo\003\220<\320?`X\350\222H\202\256?\355\337\272\376Re\355?d\222bo}\365\340?\010\340*H\3636\352?\214;\2522\221\\\311?\305+\264\333\236\326\354?\225\257\272\270\242\026\342?\257\341V\245j\325\351?\300>\220\350\235\341\272?\226\\\n6\231_\335?h\323\323\200\220\366\344?\310P\000\351\353\255\330?i\233a\344\352\356\356?\'\365\371\206\330l\340?Ne\357\327\3001\323?\240\216\327_u8\260?x\224\220U\314\240\320?\256\177\267\346\234\n\334?+\302\250>\251\247\340?\214\352\273\013\263\274\322?\002\3351R\334v\347?J\343e\r\342\232\325?\230(\024\2345p\327?\362q\251\300;\346\345?Q\033\263\"y\320\346?\270\r\327\200i\347\302?\352\252\376i\252s\340?\252&\177,\253\r\332?*R]]\016\032\344?!\206{\017\226\026\353?\212S!\333!\247\325?\373U%\354\246\351\347?\230s\00607\212\306?2\355\376\323\343\315\325?\330%\252\212\332\313\333?\370\032.\244\267o\344?\224\212\324\371\264\300\331?\307\363\241\366\322\237\340?\250\275\000S9\373\301?\2449\254\237\025\323\316?\266\360mQ\264\016\327?Tof\271\267l\354?\341\234\036\301\31
1\314\347?\320\352FQ\tv\264?\005\250\300b\376\203\342?\000\026\366\351\034\372\335?\253\224\337 \370\266\343?\003\342nG\263\274\341?\214\233\200\032\264\331\351?o\'\253#2{\346?N\252\240p\323\236\343?\324\366\311\364\007\367\301?Z\234\361W6/\335?\"\020\002\307\260\357\344?\2562a\306\203s\323?PEG\003L\213\256?8\035\275$\253n\276?\310\305\315\306\n=\310?8\267\034\341w\315\314?p0\363\373x\205\276?Me\253\226\332\252\345?\3168\310M)\367\325?\036DE\220\344Z\326?\240\321G3\256x\250?r\233 \\$\374\323?ZN\367,\334d\357?l\212b\024\242\350\333?\366\213\206\005M4\342?a\254\273\355E?\357?\240u\026a>\374\271?\025\023\354G\305I\344?\177\340\332\272\237\230\346?2JJ\010>\264\350?d\032\276\263\350k\324?z\224\346\303\244j\327?8\205\237\244V\376\355?\246\371\363b#\r\321?`I\342\023\\4\307?\340\220\\c\307[\347?\004m\312\301\370\214\331?R\372E\331\034\206\336?\226c\2617K\360\351?\270\312\214\272\026\274\267?a\276\370az\007\353?\345n\364\321$[\345?b\366c\333\274\325\335?\021\n\232\0045O\342?`vl%\376\356\274?Y\254\363\014\256`\354?k\2025\t\266/\341?l\211\245\325\014{\344?&\350\033W|%\347?\306\260qD\274!\327?\0104c\000\334h\344?!\010\322L\364\001\356?\301\357:\023\302\304\351?\241ip1/\256\351?4H2j\352\321\326?FVN\224\335\273\324?oc\240=V\236\350?U\212h\212Wf\357?\350v\356M\325u\341?\010]\344\016\200Y\310?\370\355\271\371\217m\332?(\231\312\2514\270\263?\333w\367\036\333\224\342?0\375M\240\326\301\355?4\364\311\317\342\313\331?\200L\321\356O\356\237?\202J\032#Uv\345?6\206~j\010\357\345?@\343|\003\217,\261?\020\362\314o\n\226\346?t\030\024A{\013\303?\n%\230NQ\034\356?\222^I\033tN\345?\356\037\364T\177\363\323?`\322\277)\207\250\242?\214\325\233Wp\022\303?\303\360\213\241\311\200\351?\336\025\020\265\'\372\322?\374\225\257\'\273_\326?\314\006\010v\223\037\302?4\340\213\366\255\271\326?j<\335\005\005|\327?`J2\016\336\357\235?H\t\274\323\023\032\331?\000DVCL\033\315?\213\256\240\007\004\n\347?V=I\277\303\033\335?\2170\n\256\200K\355?@\231\257,\036\021\335?\370\345[7j#\306?\374\215\r\031iE\321?\374\
374\357\235\263\'\321?\221\203\003\023\306?\354? 7\250\016g\013\274?h\250=\321d/\356?\031~\374V\016\233\340?t<\247~\313F\346?X\031\353\2008\252\353?P\364\033X\232\360\303?+((\241\023\360\341?\363\305\217\013&\235\342?\372\333\242\005~\025\345?\rO\277fe\211\350?\267\353\313B\303\315\351?\273\333K\251Y`\357?\332\324\375\367\310\034\345?\000\347\030\311\251\255\245?\235\231\376\010\'l\341?\317W\231\207q\366\342?uN7\336\351\017\344?$z\334\326I\013\337?~\361D\250\216\361\341?;\230O\371j\301\353?X\336\017\037\340\244\277?H\032^\021!\241\310?\266 xM\036 \337?\327\276\023\377\240\207\345?Wp\031\300\344=\341?\226\335\037\320E\003\340?b\331O\3301\313\325?0\373\\\377}\304\303?~lI\201\231)\334?\370x\n\320\261\266\325?C\353\342qB\260\356?\240\251\223\315\n\371\352?\222\362+\304\273V\326?\340f\322\334\370\357\234?\000\274\n\245`\324\\?\214}Yz\247\363\314?\202\2645P\352\327\354?\244Vv\321\372X\356?\230\312\003\032\372\376\321?\274\352\332\014\325s\306?8\263\370\322\016X\347?\215\240\3432.b\351?\202\277\361=\005#\340?\2721j4S\362\330?\207u\237\035\315\006\350?\231\374\231\0075=\355?\310\257fb\222`\320?NwJ~fB\332?x\255^\247\242\030\347?T\310 \341{\021\335?x`\335\031\207\274\321?\374\3254\240\355\342\336?T\203\247\024<\336\340?\240\212c\264\247\360\221?\022\316\352\340n\227\351?:\336|\377 
\274\350?\013\230\2577\261\350\345?$>\375\255y\225\340?\267\017aLp\203\341?\234=\311\332\317\273\330?X\204\254\326\217\r\336?\320\257\337\325\366\335\256?\\\363\264\231\313\"\331?b\2620\026\361\315\352?-4\203\'\241k\347?nYg\366\006^\350?X5/X\224W\350?P(\010\363\004a\341?\220Q\'\240\010\332\273?\360R\232k\2229\336?\200\273\243Bg\321\267?\350\316FP\347(\332?\372\367\220\307\"\354\351?l\300\240\311\237\223\306?@\315c\264\007\250\246?\014\026\243\007B\314\336?P%\037\221B\357\343?c\034u\210\206e\357?\360\377\334}^\311\267?\"\311\032\305\330\364\351?HQ\253\213\251\225\351?\016\250\251\335\245\253\342?\017\2149\267\241\314\345?\272`\265k\316\013\320?\347\260OP\317\330\342?HvQs\302h\274?:Df\226\262\312\327?\236\235\004\260hE\332?\340\324\356A\245d\244?\300\245\270\270\217\234\302?/\227\3576(;\354?\005\014?\213!\315\342?$R\365\322\254\034\300??\020\255\354\264\307\343?\310S\200\207\221\376\313?\324\205iE\262\026\314?8\246\007\254\313-\325?<\001\274\331\0025\352?^\003\207\374JR\355?\304\034\242\n\355>\336?\371\2423\024\"\273\342?f\321\356\335\262\355\334?8\315\212\344\351W\345?\236b0\322\225\240\340?\034]\313\261\000\025\350?\340\370\255V(\265\241?\256\035\245\006\237*\347?p\250x\334\214 \264?U7\252p\310Z\343?\031\275j\321M\261\341?\264\362\262\217\324\311\323?\207)X?\007\363\345?dv\370m\207\356\353?pd\337\002[\203\325?_,MN\032a\356?$\244\227j\360\223\323?\355a\335\006\267\303\353?\010\361\351B\231\r\310?\340\017\321\177\2117\304?4n\342\345!\276\322?\336J\257y\224\201\350?\017\026\365OO\207\345?\250N7\313\222(\356?\250\325QMK\372\315?}\365\373\331\036\305\353?\266veT\202\365\352?\252\316\n\266\234~\332?\340T\001\027+\324\327?\234\375\240%\233\220\330?\014!f\373Qd\357?\0009k\372\374$\243?\244\t\022T\t\231\333?\351:@\362F\371\341?\002\035\305\007\223\364\320?\330\222\324\277\\\325\324?J8\002\200\305\333\346? 
\212tJ\346&\340?\250\260\014&\025\'\314?\300\277\"\031\355\300\254?+\016\220\236*c\347?\361\261\023\360\254\361\354?fe\331\203\025\032\326?hL\344\312\331\344\267?\323Yt\273\365\350\346?\254!{\352g\224\343?\316\375\236\222u\035\337?\217\314F\373Tt\353?\350\242o\317\204^\346?\314\353\357\262\276\331\340?\254\276{\3219!\315?\305\272@\370\376\337\347?0\251~\324\314\010\254?\000n\335\n\274C\334?\350\360\373\352\377\224\276?%\236\320\2759\201\351?\210\306AWq\232\306?L\302\301\256\345\350\334?`\332\305K\274.\242?\202dU\307\320\274\327?D\331\322\026\365U\314?%\177\237\005\331X\340?\321\360\357\353\243\344\353?P\026\314\013&\327\252?d\304\364$\024\255\346?\255\263\331\024^&\350?\025siS\000\023\353?$\203\213\222\206\374\311?kIg\266\326\240\357?c\031\022\354\322\372\346?H\344 9\0078\314?s\304\330\211\270\240\343?\026\250\305\232\316\333\322?\254\327\266VY\370\310?\267\375)v\210\207\357?\214\222\001\231m`\314?6p\302g\354\341\342?\360\265|+\305\215\300?\";\340\372\354<\342?\016\307\006\363\372\331\327?\360\315d\003\375\324\257?~\364\001\3347O\343?<_\n\316\264\262\301?\3524\253\322\255\234\342?6\020x \216V\342?\277\260I\252+\204\351?p\036\263f\360I\326?\352\263P$K\373\330?\207F\224\000\324\340\352?\3676O]\313\301\342?\202;\372\000[\327\332?L<\307\2433+\321?8$\023\"?n\327?\250\314\340\226\363I\322?\210c\t\264\024\244\344?2\375\233B\"\224\357?4T\201^)\000\302?dfu\317tZ\311?N3\300\2167I\321?\244\251B\341\361\305\304?\200\330\" \350E\357?\203\003\325\2725K\357?c\022\227?\036\316\353?L\231+\001\223i\354?\350\035\247\323\\\236\355?\236m\335af\334\352?\363\334L\2472\026\340?\370\354\223\317\354\244\312?\266\033\301D\327\206\323?4d)\260xw\325?|S\013\000\036g\340?Q\021K?\356~\340?\360\317\311\022\240C\357?\272\221D\332\222\340\332?\226\366\301\277\352\332\336?E\230T\277\320^\345?\274_\031\026\215\322\317?fy\016\3300\204\337?[\322c\214\334<\344?jb\017\357\021^\324?\274\227e\002\322\227\347?\370\005]\242^\t\345?\375\366\037\2630~\346?\206D\026\241\035\034\356?\270 
\257\"\202-\317?/\341\276\306H\374\342?\260\034\020\317\r\"\337?\230\231]\021\245\223\271?C\272E|Z\264\352?0\302\000\276\245\242\340?\223\362\225W3\271\355?\344%\257\354$\037\333?\330\246\371jM\254\306?\200\257\320\333J\354\351?\300\254\311\206^5\227?\364y7\301\316.\307?\314\232W\tF\021\307?T+P&\326\250\352?\360G\3128\226\342\247?\262\214\007\016\017\335\340?|\270hH+{\330?K)\373\331\235B\340?0\313\313\0248M\262?\343\322\240m\203\251\355?x\307\333)\022s\322?\317o\t>\376<\341?\255P|\025Q\374\345?\'\320\037\230\250!\357?\356\343\002gV\366\344?d\021\340\001\007\332\334?\020=\221\330\336\335\352?`\2538B\251\324\225?\360\177\017:K\246\335?\202\316\366\265\004\245\347?\326\230\352\244Z\242\335?T\270\0176~\345\307?\220\3209\311\206\007\273?\204\"\r.\275\375\343?\023\352\003;\376}\340?\024u\266\223M\343\344?@h\3242\365\340\333?\251\n\240V\202\002\343?\261\334\'[w\247\340?Pn\331;\347a\262?\235y%\367\251\331\344?\014c \003\035\364\334?x\307\007\n\274\014\262?\374\023\300\362\226\323\350?\320\225\355\177$\333\333?})z\257\370\364\352?\333F\356\357\200\024\342?\244$\027\272\376o\342?\360\327\231\365\375\272\354?\000\266hU\246\324\341?\200N\036[\010\202\347?\227\025\303\353\223t\357?`\220\237\311\237R\300?~y\375\006\354\n\333?8\2353\'\360\375\324?`\004c\363\354\274\325?@~=66\202\332?\251A\274\251\214Y\342?J0\247c\201\225\341?\340\'t\315\354\217\264?@9\010\316\201/\216?%\374\224\3037\233\344?<\341\037\013{\232\354?\320\017\363\373~H\270?\311[\313\322}-\351?s\246\005;\325k\350?\210\251\252\r\334`\335?z\371\203\327\315H\354?\320\330$K\272U\336?\022\224\301R\236\336\341?\360\022\320on\025\345?\360\201c\244/\303\350?\264\247\327P\313\225\336?\237T\266\r\006\245\352?\272\"\262\256(D\355?\325\254\365\033\243\n\354?\024\307p/jV\316?V\332\210\312\212\343\324?\327\366N~\212\237\357?\200\025\252\263\371\036\207?\224wVd9\013\325?\250g\356\221f!\263?\300|\343\330\220\334\331?\362\037)\215\373\227\335?\372\360\000\212\302\244\346?\274~g\223\257\324\332?\024s\255WE\250\346?`b>xO\263\262?eR\007\3
67\273\233\341?/\330\326\361\312\037\347?\315h\316I\213\275\340?\300\373[gEk\305?2?I\021\262f\347?\250\377(\347\311\332\314?\300Q?o\240\315\274?\323~1\254-p\343?X\277&\tuL\271?\025\327\376\034j\217\347?\010\224\341\343\244U\264?\310n\233~\336\337\322?\337\343F\202\023\303\345?`[\225\010!\371\304?\211s\254\013\024G\345?\024\222\006\314\340\020\300?;\244\255\356\270\324\347?\020#\353\360\371\301\246?\\\000\013\240(\022\330?\276\331i\214\312\017\344?\345\346\000W\313i\353?\360MD\247\005\003\320?\030\201i\3634\344\313?\254B\031k\023\310\304?\030\316\030\215\204\335\267?#\233EM\230\344\354?bi\363u$\240\333?<\373\377D}G\310?\\%\003]\026\270\357?\224\005o67?\342?\342\020s\247k\"\325?\374_\324\343}q\313?%\236\234:\301\300\350?\000\010<=\000j\350?\230\3253\000\301\353\301?\244\367D\214\232\022\337?\364\ne\374\030\331\346?\213\342\225\372\n^\346?\274\260%\351V\010\340?x\220\353Q\347\330\304?\203v\200\266r\374\345?\242\375%8J\273\334?\007\245-U\362|\352?H\262\236f\2065\350?F\300\017\362\255\245\356?\212r\334\320C!\324?x\311\034D\037\362\303?/\207CE?x\342?,L\301d\275:\335?\342\335\330\031I\021\335?\237\342\001P\204\032\344?\010\241dYX~\327?\320\366\272\355\236z\304?\000\3667B6l\\?q\271\331\354c\301\341?\006<\226\337\254\342\354?\200\006k\025\2207\263?\001\361#\000\252i\346?\265\025\3546\367\017\340?\266\001\201\330\315\257\337?P\010\374!\n\372\327?@\267\3463\313x\211?wg/\336\363&\340?\346P\275zh\267\332?\264\230M[0\234\323?\365\244\303O\261W\343?\222\203\235\016\274\271\341?\277<\014k&\000\340?\230p\267J\315T\264?\230\364u\225\256\271\342?\314;\031\002\034\234\331?\240\373\236\217\372\230\333?\252\317A\217r\000\331?\000C\037\247\354r\325?\007V\013\374y/\345?\014\006\377\236j\353\331?\365\027\033\317\031\216\344?T\300\204$D\244\305?\000\343\237\272#\376\250?(\323N\277>O\345?A\275\036\343\037\375\346?|0\007m\020&\340?\214\277\336\001\247\313\314?\250\272\301\302\327\261\270?\372\365\037\034x$\346?B\260\370\212\346u\331?\270TLY\316\223\331?\270\341\300`\324P\304?\2721>\233D\037\34
1?\274\344+\257\204D\337?\225\005\036\205\034\037\356?\201]\221\237?\016\351?s\205\267\201\314\330\354?.@Re\253\r\327?\360T&\025\315\327\317?fd\021\242\360I\352?\200\362W\261\366\355\342?H\267\305$\313?\356?\347\206\241`\275\010\344?\340\314\177\344\017\230\235?\304\016T\252\207\031\330?\362\213\227$\331W\356?:\017C\205MT\323?\373\205\004\232X\366\350?@E\231\002\302O\357?z\3457\266\260\214\337?\205\210\037\265\210M\343?D\372\2760b6\343?r\362W\267k\333\341?>*\236an\244\353?\177\372fS\361R\344?N.\322\026\026t\321?\321\246\032\340\235\261\343?\000\373.\206\0312\261?\254\200k68[\331?L\256\225\036\006\t\347?%\023-QQ\252\356?z\021\3365\355\316\336?\221\260\225F<*\343?\366\035\300\344]G\356?\323\023t\021?\341\344?? \323\310\023\203\341?\200\216\331n8{\230?\010\334X\315\255]\300?qImhT\205\355?\302V\016\265el\323?\373\356\350\323\323\032\350?JeL{\205s\340?\006\027v\271\353\245\345?\377g\252<\032\376\341?R\213U\327\r\337\346?\240w\255\256r.\305?`\221X\270\240\\\264?0\3065.\365&\300?;\245`\266\025\203\341?\177|0\211\273\357\347?\0271TE\272\326\353?\250\226\n\036j\270\301?(\332\2610\033\352\300?\350\027BQ\215J\350?\235\t`\0307\225\353?\213\355\241E\"C\342?F\247\301\212\202\217\335?tXE*?P\314?\025\276?k\216\n\354?\311%\272\034\003\373\353?w\213\337=\276|\351?p\273\034k\227Z\304?\337=\023\267\230\241\353?`\376\004P\030\226\345?\3204\233\r\0258\306?T\343\204.)`\356?\326\355,\376\027P\333?{+\336\252\332\346\347?\300\270;\225\327\306\252?\000\023\275W+\r\247?h?\303\332\341\252\343?\324\214J\264x\032\344?\000\246P\354\312\024\205?\2729\326\220w\001\321?\230\t\367\354\355\325\304?h\310\234\233\tc\310?\355\003\254\223L\217\353?\314\"{\235\365\377\355?]\324\307\351^\205\355?\242_\224\023iu\344?X\316 \026\320\301\347?EVW\036E0\354?E\373>\366\211\357\344?\210a\272\353\350,\341?|\020\247\217\024\347\315?\275\210\244\226\226t\343?p\351\242\266\246\377\246?\032`\262\302\214C\352?\300\020\331\t\304\371\333?@\222 
\370\204n\302?\240\346LQH}\245?\300\016eNYX\331?`Q;\201\031\272\223?\020XX\364\2661\254?\000\214\'*\"\352x?\024\343Z\336\352+\303?\240es*\026\316\253?rX1]\365\340\353?\320\247,\030\237\374\240?\252s*\303~\312\336? \255\216(\274O\317?L\335\267\373\375\301\310?\215\207i\225\024\020\346?\314\322\361\322\364\375\356?\244\222\2564>\320\323?\240\374\005\307\265\343\267?5;\216\206j\272\352?\230\\\323g\362G\323?P\226\367-:\317\271?\265\317\305\025\240\226\340?$\326\336]\311\252\326?\320\376\275&$i\303?\n[\231\316\374\372\357?\000\026\030GW\377\230?0\200\237\315~\326\276?\032\215\243\215^n\353?\370\225*\025\310\307\342?_\177\324v\371\325\346?2\364\371RB\231\320?|\344\331\266D\264\344?\310]w\301\270|\340?\321\000l\262\233\271\341?\033\260\023\246F\001\345?i\233s\233\3000\347?lw\364jMB\300?\316~\277\027}D\351?;yTV(S\354?\000~\t\036@\276\334?\233\237Muh;\351?\021\213\2437F\257\347?R\375,p\266\247\351?\356\354\277\354w\032\322?\234\306Ru\0009\301?}!\252\244 \021\354?0B\322.B\265\300?|/2\203\340\370\330?P\335N18\227\253?8\353\242\262\203\t\324?\350v\n\334\264\204\337?rTY=\303\300\345?\300\365*\277\272\365\266?\022\271\373\307Sn\346?\376\354]]\226\326\320?\320\207iU\234\311\345?\303\017,\321\236\336\343?(\303mb\022;\271?\3325\362\017K\236\332?\016Z\376I:n\336?\330$\266\307\232K\324?\214\0374\357\330M\330?\220\026kq\221\261\322?\301\241K\2017g\350?~\211IL\2463\320?\004\344\030\030\036\365\333?.\325n\311@h\354?\017\217\264~;\r\357?\216;\224\206\240\235\357?\330\212\302c\323y\341?\303\271u\345\371\323\347?4\262|\336L7\337?@\345\030\332\375\376\241?\224R\255\333+\360\311?\362\320\201\305\235\214\327?\010Q~\326\255\331\307? 
x\250|I{\277?\341g\376DIo\345?\300\205\353E\027T\332?X\352\357V)\002\330?\034\250\351uk\001\336?$\263]$wR\300?\250&\2147W\210\356?\250\017\030\362\311E\346?j\n!\253\201\230\342?}9:\375g\361\356?#\\\342\203\3503\341?H5\371\002;\254\263?(\306\234\205\346\373\312?\305u\"\247\314\222\351?\360M\232;;\321\274?\000\3373x\356\035\271?\354\3724;\300=\331?\222\347Q\023\236\027\343?\250\"|\327A\306\337?\360:\354l\253\013\312?\n\001?\332\332\326\356?!%\257\274N\361\346?|\360*\271\304\033\330?\317\006J\256F\003\353?Ak72|\267\345?\2416\335V\216:\347?\204o/\337V\305\341?\344\320(Z\332\310\341?2\252y\370\000\260\357?H\220\317\332\375\013\303?I>\261J\302\004\341?z\235\215\200w\t\322?\301\253\232\271\331\347\357?\002\243\271\356Ku\352?{\304\356\266\307I\350?\010Is\316\263_\305?p,\264\321)\023\332?VyBm&g\320?\210\311\216g \304\321?r\265\343\037\360\357\332?F\026\271!\345;\350?g\251\214\276Z\206\344?\3424[\241\265.\355?\253nG\266\276\316\350?4\243;\266\310\275\335?x\310\262\364^\001\353?8<\tU\361m\305?\2626t\342\216-\353?,\347\223,\273\034\314?\376\262\200J:\347\351?\366\325D\371Y\310\356?dE\324\330e6\333?\2260\024!\350{\356?!:\265\361%\266\347?H\'\247\271\266\365\332?@\212\274A\341-\320?\013\336\314k7\373\344?\301\006]\240\307\301\353?\304V\214o\306\352\316?\314\315\253\237\270c\345?\000\357\215\376\r&e?\242g\331\334|L\320?\342\356\227a\006C\350?\224\374\305Z;8\343?\352\222_\354jX\355?%~\"\270\036\363\346?t\202\206F\353\327\343? \234\323o\006;\324?\355\'5s\372.\357?}\021W\004V\271\342?\202,\224s\216\220\351?\266\215\001\3700\245\320?\277\237\316\273\302\204\346?\332\rv^i\260\337?\302y|m\230!\320?Pb\331Z\273&\311?\330,lg\324\024\276?\007\n_\03780\354?\354\312$\036\022\311\344?\301O\321\312\0105\355?\340d\200<\375o\271?\2258(\324\300\033\351??\250G\247\2400\354?p\251|S\267\267\300?\306MRFCF\346?\211O\027\373\371t\355?D\321g\234!X\327?\200M\023(\276\277\324?P\251\207\021\375\225\241?\000\026i{y\r\307? 
\024_\276L\246\357?dG\212\302\022\350\335?\033V\303\010\277\216\347?\244\254\007\272CP\343?=2\350\245}\022\347?NW\331\257\020p\356?\275V\245\214\355\273\350?u\324(\270\376\362\340?$pe)\321\270\341?p\354D\252\210.\332?\345\305{\354\213K\350?\367J\354\340\031\367\343?c\353\033\331\346\335\357?\3723d\216\030\314\353?h,\207\321\226\004\275?\322\036 /\201\304\356?0\266C\006\243\352\264?\255\331\223\331\233r\353?`S~u\323-\306?md\250\221\277\276\353?\320\026\357\316\232\376\354?\200\267n\236\271ru?\304\277\360\034qj\306?\214\362\264\234\337\256\350?\310\342\266d\316\236\331?\3747\345\253\231(\337?#\032d\034\374\275\343?\367RK\366<\030\357?t\316F\262?o\353?\024\334Z<\203\035\341?X~\230y\303\036\303?P\024\266\276\304:\274?\300h\353Y\332h\332?~\302Xun\237\320?\341\251\250\323\350\371\355?\316\003\250C\267\361\326?\200\036\306\272k\322\256?\224\264\3105)\177\322?uP\334\002UR\353?\362\014\341hp\222\336?\307\330\260;\203\341\340?\327Y\314q\3402\351?\000\013e\306\331\362\202?\024\023\236\025q\307\325?\222\251!\246\273\255\320?\036Z9\2015Q\347?F=\254CMX\344?\330\253bO=+\357?8\302\206\206\204u\313?\213\233\365C\006\276\352?y5\331U\377\326\341?\363\356\nb\322\244\353?\315\200\334r\013\200\342?\002\351\326\217\210\353\356?P,E-\010a\275?\232\001S#\255\210\332?5\347s\035\272\331\347?4\345\341\000\300\214\310?\364=E+{\300\337?`\225\203f\376\206\244?\336?\300\32610<%\346?\210\367\3647J\273\357?\004\275\351\250\2753\335?\202W\275\2459\326\321?\370\010\364\272_\245\312?\363\315%wz\033\341?\363\001\241\373\252\233\350?\240Q7\353&\303\313?\263>\304\035\272L\356?ld\275v\307\216\352?.9yA31\325?\263K\251\034\335\246\343?\272\314\347\311Pc\333?\210\363\276t\240}\327?:\327yG\304V\356?\346\303l\024\340\303\325?@\231\002gP\321\263?\263H\230\242\305\205\345?V\263\256\227\323\'\321?\212\220\370cj\345\327?\204\030W\003\352\\\310?\000G\000\020\022\232\222?\356\002Z\2137n\351?\242+\333\216>j\351?\024\025\177\234w\221\341?n\362\027\240\240\350\340?^?JX\275\305\324?\000\233\312\363/\371u?x\006\215\323\200
\341\352?\260\227\222\204\304\030\303?P\036\223~t\006\254?\014\306C\320\315\367\336?\350\006)es^\261?\2428\034\255\204\311$\325*\324?d\350~\204e\311\316? \311\265\035\262#\254?(^\331,\334k\354?$\223\364\233B\232\311?P\360\305\r\261@\251?$p\267r\311\022\343? \244!\341?`\253?o\324\355^w\276\352?\036\347\n\227\250\254\352?$\216\345\343\336*\303?\004r\203Ge\364\335?X\272-Q\013\000\266?:\2528\356!Q\354?wL\351\022\330U\344?\2306\r\214<>\323?\323\356 y,\271\352?]\346@9g\324\343?\306\272Q\217\266.\321?\220\246_\"Q\035\331?\320(\274\251k\304\317?\267Y\222r\311\263\345?\347\352\262\332\341\353\345?\314\226\276\376\354\244\312?Dh\322C\230F\306?E\321O\234/\022\353?6Z\344:F\221\352?(\207\034\013\350N\276?\327\320W\031\305\177\352?\332/\334nu\220\351?\000N\253\300X\356\311?H\322\025%\304\254\265?\204\234\264\234\224G\320?\030<\212hE\373\337?\371\224[k\010\256\352?\350I\327\014\232F\342?\"\243\227%\017\252\355?@\335}4\013\373\273?C\372jH\030\332\346?\317\220\364n\350H\340?tW\360\277;\n\324?\260\0318\343|\"\330?\000\231\231O2\014u?\000\2178\343_\347\247?\300\262\000g\260\314\341?\324wU\005KE\305?\240\250\252\377\363\272\344?G\323\030\013\210n\356?\205\036\244\353w\230\340?\222\273\354KOA\320?@\340\2460,Z\243?6|\315\023\322\244\337?\002CPEzm\322?\016\206[~ 
I\357?<|*\030=2\312?u\270.P:\267\346?\277\370\007\252\321K\351?8\036l`\201\017\344?\000\235\307|\215N\273?V\206-#6\002\320?\t\006\2153\314\225\344?\240\363)\231\376,\311?\002\243,\250\035\306\326?\335\310\005\305gx\353?q\261\034d\257\221\342?o\344\321\257j\300\343?W\342\370e\315\241\357?\n\235\013V:Q\327?]\273\205D\256\321\347?\002\005A\314\234\005\324?\273\315\271\300]\223\343?h\3233\315\270@\325?\376|@\016?Y\335?P\351\253\036\335;\346?\220\323\022\354\365X\305?=)\303d\220%\342?\267m\036a@\030\340?\372\013Lv\3123\331?b\"\253\335\323%\355?~\330\266h\220b\353?\226\',\"\254\273\322?\306\037\323\267\000\021\357?4\205P\254\002\026\357?@m\313\275\031\347\204?A\313\275\212\374\217\344?n\250\262\034g.\345?\002\361v\202\013%\337?[\2525\274\204\244\357?\250\216P\255\204\256\274?\311\325\302\232\337\246\357?\376\027\311.\247|\345?lgUlh\233\331?\300-q\241\210\305\237?\2164e\002\335$\331?\215\271\360C\231\203\357?\360\026\222\237\003\235\326?n\024\225~\374\340\356?\247X\000W\013T\357?;\361\205M<\223\345?E,\217\307{\010\347?h\272\311\2414\226\342?\'6\022\010\371\364\357?%\220)\003\313\224\354?`\272\223\034\201\206\302?k\231h\324\301\274\340?T\277E\250M\346\300?\230\332\245\250\246\314\345?\034\213\307\005\275\220\343?\034\037[\021\271\003\341?\030\010\356\377.s\312?\232B\325\217\027\005\354?\214\\K\317\343\216\305?\347a>\247f\033\342?S\346\236\234\027T\343?v\000\204\262\270\\\345?\t\201\331W@\217\351?;\005\250\251T\001\340?\346\367#f\320=\333?\310\007\215\026\025[\274?\362}(\337\307\235\357?\212\031\353\007\220\271\333?\316\014@n\261\344\356?\200\343v\016\266\243\227?\023\203`|\217\314\345?A\225\234\'\247\374\341?\200\364\272T\324a\340?\350\264\303\370\252\226\324?\235\023B\266\\\261\340?C\252\334\254\236S\347?\365\203<\375\222\314\343?\034\002\334\376\277\026\350?\337\013\246\217\335\263\357?\214\236\2225\363 
\341?\025\216\300\361>\254\356?\036\003\237u~\354\323?8\031\210\222\262\374\345?M\252\027\341q9\354?co\211.\360\347\351?\230\266\r\307s\322\356?\224\377\325n\363P\353?\300\002\222\2063\245\306?\207HE\004\010\341\355?|\331\335\237\370u\337?\250\242U\254\026e\265?0\302\363\030\221\225\306?@\177\026F\312\036\206?\365\274r\327}1\344?\003\262\010\255\221%\357?\024\372\037\306\014\276\343?\010\002\371h\300p\347?\323\233,]t\000\352?\265)\306`\226\267\344?\010\372\037fvf\322?p,\"\210\222$\242?\204\023\364\206\223\261\350?\311]y+V\213\347?A\034\246\277\335$\342?\203\001Is\214\205\357?\242\366_\303\220y\350?\010i\'e\320\353\315?\340I\317m\346\264\332?o\\_\255\000\n\352?\265\355I\t\3627\345?\273\3505X(\372\357?\306\"!\212\227\241\335?X\021\316\266\016\216\325?\210\0070\226\241S\306?]6\2116\233\300\340?\300\352$\333\373Y\273?\017\007\026g\356\004\355?P\317Qv\3319\250?\375\352\030[\357\310\355?\007$\005\333\312\316\343?\237~\315\321vN\340?\331F\232O\025\240\345?\010\360\252_\254\014\353?0\033\275\262!(\323?X\207\031\303\300g\356?\275\242\261\rJ\260\346?\343\025\222=\304\312\343?\234\260\231\215\362w\317?\200\207\'\026\222\300\203?@\277\377/\220\200\252?\306R\207\237k\036\327?\205\332VN\005\337\353? )\017?M\243\233?_Z&\265\r\371\340?\364-]j\316\307\337?L?\204\010[\304\330?RF\254\346;\366\324?a\000Pb.k\350?.\374\341*Q\r\355?\357\236\306\305\344\333\347?Na\231\363\037\372\345?\236R\345\214\314\257\322?\241mU3\005\037\346?\275.\360\n\"\317\353?\254J\371\037P\311\336?207mV\325\323?-\377\246#\342(\350?6\345*\317\027+\346?R\337\243\334\033>\343?\304\n\017\2520\317\340?[\227\206\034\365\225\347?l\344\306\361\'\n\314?\344\330\323\253}7\310?\214D\027-`i\347?\002\302\256\304\375\376\352?\010\367 E=\241\313?i\326t\330\242\001\346?\206z\306\205\2658\332?\261\tJ\272\203\376\341?\200\362z\260!o\216? 
\177\255\277L\302\346?\236\354\373\'\311\277\327?Sm*\313\225H\343?.\311D}f\032\350?X\034N\377\264o\276?\320\277S\360\335w\262?\362\313z\317S\254\344?\001\010\247\324\367\243\351?8\021\266\353\301\206\327?\327\234P\265\\\353\355?L\364\022(\334\304\331?\224\014\302{\250\346\305?\001\214\236\351\355\032\353?X\251s^Z\305\303?d)Ml$\247\321?j\033\243\':\373\350?\363tOk^\222\357?\370\274y\210\357\307\357?(\210P\3600`\265?B\324>&\327\022\350?>7#\212\201\325\326?fd\000V\366\273\356?i:\331%A\311\342?\004\2752\273\014\367\333?\230\255\203T\262Y\261?k\034N\345]\003\346?\002H,\240U\200\344?V\343\r\261\\\215\341?\026?\222\300\362\361\340?\341\277\217VgK\345?\365\005\372*=\317\355?h\350#\350k8\275?\247[\025\247f\020\352?\254\354\303\371\373\021\300?\233,\307\230\271\243\351?\004R\247\230\265\376\323?\324\2376o\341\365\331?\036\261\244y\277\235\321?`\\k\337_]\266?@vO\231It\352?\215\036A\376\364o\352?L\207m\240\017T\310?\214+\034\371\271\006\317?0\337\023\326s\236\354?P\351\353\320a\327\257?\002\206\262K\334\005\351?D\260\302\230\250b\352?*y\013\333U\211\331?\276\323\001F\336\330\344?\240\373%\364\222\201\255?Z\r+\251\343(\357?\n<\274io\313\320?x17\014\027\377\327?\270F\350;\350\233\307?\"\306~\245\307\345\351?\212\33127PN\342?H\343O\243\2265\331?/\304\000\330M\020\340?+j\033\274Y\017\354?\204\201\347\006~h\342?`\361\3442\251\t\331?\340\312\362+\367\241\254?>\005DL\036\311\350?\024}X\205\\\326\355?\300\020\322iD#\236?\370u\001?\357\005\265?\217\262\360\2004\312\344?H\'\234\266\300\336\351?p\006F\302-\340\322?\362J\361\205\017\317\323?|h\325Kb\377\300?\325\245v\035\032{\344?\336\007f\006_s\322?\310Z\277\005o\010\305?\376H8\241\370\267\354?/\344u\322\354\013\342?\320\030\202\255M\257\305?\335/3\312\002\315\356?\271\2743\254up\341?\264\246\350\362NU\320?\317!N\344\371|\344?,Y\263\275d\322\335?\337\354\311\201\256\311\350?\224\335\026y\022y\321?\016,.\342\020\376\330?P\274\226*\250\325\267?}\330\363\013\353\312\355?\356\006(\t\2060\344?\231I\222\243\264\320\352?\245\274\337\200L\004\352
?\240\215\300\032\372X\320?(\374-\262\267a\276?\330x\357AL\231\325?\014\316F7\242\312\325?\200\\\024*\n\226\326?\263`\254\014\206\203\344?5\177$\355$\004\354?\274+\3559\035\357\340?\356\307\216\205\260\335\342?S\362\363\263\2356\345?\241r:\264\277r\345?Dcp\206\251k\313?\253\375m\000\212\251\342?\374h\201(\214\t\357?]\333\234;\332\204\350?\3760\3126\240\326\330?\307\326\314\321\256\016\357?\361\204<\026]\216\344?i]\'\327\177b\351?\241\260\3603\214O\347?\354\362\001\270R\374\315?Js2B\260\006\344?\376\3220tTT\340?\312\t\341\202\340\271\353?\266\372\205\231\274G\353?\214a\230\371\313\267\313?6\237\243o\244D\346?\256\275l\267G&\351?{\302.\220t\230\340?\334B\320\202^\226\345?\366\276z\203\271\301\350?7\344\261\332n\342\344?\002G\357O\253\316\320?t\343\010\262\030\031\352?\030+\323\314\212\301\320?\014\364\342\212\212\320\305?\244\352~!o\233\347?t\202GyI\357\347?\216\036\331?\275;#\300%\235\353?\030\243\243?]\001\343?\246~\"\270\352\233\350?[\214Y\231\235w\342?\250$\342\304\224k\263?8\261\333\240\310\306\307?\302\301\177\271\260\202\355?T\002\004\220W|\327?\340z\204\226&\313\301?\254\213A\360\036\251\322?0\325f\226\244\350\351?\270S\262\023\007\265\314?\364\306L?R\232\332?\346\264\003\263\242\323\334?\014R\267\256C\000\345?\033\230~\016U\000\357?\271N\226\301\250\214\355?h\356s:\241\254\260?=\2062\270o\233\356? %vb\203\336\251?\335\2718\3145_\352?X\322\246=\236P\332?\374\211\361|)i\357?\307 \263\264c)\346?\331\001U1\345z\343?\236\264~nl\302\351?\267\316\260\354\215Z\342?Hqe\037i\352\357?p\226\276VT\240\260?(\3506\366H\003\266? 
B\355\330e\245\225?\036\245\200\024ff\353?A;N\2709\265\345?\000\217\315\221\356\273\243?\224\264\201\365\241F\312?\251\006\357\341\215\367\353?&\231\353w\225\246\324?W\025\020\230L\335\346?\242p\013pX\217\334?\200\014=?{Y\210?@\240zy\263_\257?|\025\210\326n\357\343?\335/\242s\023@\352?\272\215\231\177*\001\321?\335\245c-\262h\344?\007|\003\272\210\223\342?\371\r\255\001\334\024\354?\274\212J\277>E\323?\374\362\027\357:0\312?\222\343\206\233\236W\353?Z\251\013@\030\366\325?\352Er\323\345_\325?\352\215|\303\001\364\345?\3557\220\262\237\377\355?l\256\226z\235\363\354?\273&6\036\341\177\357?\360i\334\303\246\266\333?\310\004\2545\307\311\356?\'\301R\343\223\332\345?\244\373\304\034\2005\331?\000=\331|\234\331j?\344#\306Z\2533\356?\276\307\233en\023\340?\\\231Z\201Z\334\316?\321O\2278\311\336\342?\235\201\224x\261\257\346?\000\332\0242\332(\255?\215\234\277\000\243\267\357?\0141q\310\203\244\323?\375\177\337h\276\273\354?\260/\317p\n\265\267?L\200\241\271\306r\322?\2753LG\367@\357?\220\364!d\345k\337? 
r\363\033\t5\311?\236g\006\323\254\"\352?x\225\013\354\231\025\357?\200\225&\nPg\300?`/\267j\343\212\353?\230A\232u\330\036\351?(\177\010\243\2246\337?h|=\302\252\256\330?\022%H\356zU\352?8JcE\232b\342?\300\2468\302)\013\322?\337\337C\t\226\317\344?B\213\271\354\3364\357?\240\311g\230C\365\222?\3463\206\310\277.\326?\270z\231\016\345\216\265?\261]\216\323)\330\345?x\263\210\225\207*\342?\375iO\257\237\313\351?\031\340\233C;V\355?\366\227\355\244\266\212\346?\2205\221\361\274\374\312?\372\202WC\351V\354?d\334C\266\333\340\311?2#\215\007\323K\357?\022J\253\211\251\t\320?s\244\261g\330\225\351?\204\033\214\020 `\321?F\t\367\324y\307\321?\326\337\205c>\037\340?{\373\200\342\230\020\357?e,C?+\314\341?\000\240kQQ%\214?\260\240\033\333\006\241\332?D\354\264&:?\324?\232\374\312\215u\264\320?&\262\265\027\336L\357?\320\333\237\\_\363\316?\304\231\345\r\261\337\342?\212\307l\367\017\354\355?\000Y\001.\357\230\275?\026\255\373BK\262\334?\360h\260\363\352*\311?\370\366~D\2038\302?F\246q|n\273\353?\364\240\2609\252\223\317?\201\232\345\030\322\253\351?\004\245\241/n\272\323?p\035\002\245\221\033\310?\274\342\377\241\017:\306?2\024\034D( \356?P\252\315|:\224\261?\227\323g\274cM\354?\000\263nTr\303\330?\362L\027X0)\340?\270K6\357\253s\352?)E\207\036X\207\346?\234\321\212QV\346\321?\321`\037\352U8\352?H(\334\204\324\322\264?}:\312rWI\344?\230\005\366s\2448\343?\3410\320sK-\346?uY\024$\216\242\342?\242\203\316\206\317\017\345?x\241D1\360 
\333?\220\373u[\"\240\300?\210^\241\356\246;\313?\002\2003P\203J\326?bzL&?]\347?\334\251[)\330/\340?\007:\345\261\t6\340?-\353\321\302\303\005\342?\264\301l\343\303\003\332?D\334@e6\244\313?\373>O\240\006\362\346?f\231\3543\267\020\351?\030\330\314\336\304N\322?>\271\365\322\213\361\353?\034\262&>U\212\334?G\320\252B;\220\356?\254\t\233H\351L\307?\356\271S\330\236\354\345?\213\021x\267\206\313\340?\200K\375\272\205\"\202?\320\036\222?$\031\343?\030?\345\332\201\243\354?\024\000r[\236\023\306?\345\361\350\314t\267\357?\310\212\306\303B\266\335?\230\274\235v%\202\276?>\211\366\241\362d\355?\332\nQ\213Q\351\333?\230|\216+o^\343?\306*\270\216\214\035\345?\200Zo\250v\017\217?\300\325\273<\346!\213?`\372\036\321Ho\277?\t\340\266:>\342\351?@\351z\247x~\351?k\376\214p\002\303\343?\032\256\306\312\271v\333?HH\340\2270 \314?\222\205\302\367\262\027\351?\237Kg\242o=\355?0\246\205\2245\356\302?\357u\310x(\234\343?\3710\232\202\363\304\344?\240\317:lO>\241?B\313\030\014mQ\330?\316\222\321\242U\016\322?.\017\333?\245T\357?9S\007\215%\330\350?\034\357\205\215F\305\317?\242\251[*\217\020\352?hi\211\0134\322\332?:`L\326\201S\356?\270\324\266\254\345\336\335?H,\234\3552D\317?\350\343p\003\365\343\331?L\366!\022\233\362\334?yMJ\344\255\333\346?\355\033\215\265\263\027\355?L\361\002126\315?\362\243E\363\311\030\356?1\2420\357\214\353\351?\376l\202:\227\340\344?0\344\373\334\214\200\326?\216\206\265;:\232\351?\207T\217p\n\320\347?\224\030\366R\343\216\300? 
\273|=\341\337\266?t\204\320\3502\375\303?\200Z\233O{\037\323?\212c\335O\010\260\353?{\374\324\341\216\031\355?8D\027+\026\016\352?K\321\352\257\007\235\352?\203\276\313bLU\344?Xdi^p\301\323?\276b\205\345#\025\320?\324\203\026x\024\365\345?.O\264\003\032t\353?\\\034\007\273M\013\345?\335\310\010oU\274\344?\244\247R~\266\232\301?yw\016\236\np\356?\205\275\2169\321\030\341?\210#\317k\347J\267?\2521!z`\362\347?(:\346?3X\330?\333\362\001\261D\017\347?\344\004\342\375\207\016\353?r\331\264G\336\363\353?\236\333H\177\244Z\330?\264\256\275n\2057\345?\310\232\"R\346w\357?\314\225\005\014s\316\347?`\270l\233P\036\346?\252\251\006\355\357\253\354?\200\n:\225\270+\237?jL\033\300\252\337\321?\n\314l7$\232\344?\017\320\256\220\370\234\344?\247e\237\246Vk\351?j\227\330\217Z\363\355?\3358\275\237\036\261\345?,\240\220J\266\002\340?\220\303\026`+\253\273?\274\3579\033\251\233\341?\220\2044Y]\372\310?\302%\252NI\275\350?\220\327R!\225\307\323?$BR\234GZ\322?\365(\324\253\027\214\345?|\031u\273\361e\324?f\371`&\255\034\341?\367~\226\204d\\\356?\2000,\367!\200\221?\254\001\003\256}3\341?K\3330<\2317\343?(\020H\367 \377\276?,\221\236\363\251\017\315?\020\034\343\352g\307\250?\250\235\004\021\2715\275?\220\0361\016>\340\253?$\323\3431\321\005\334?\000r\'X\253UY?r\354P\025eE\333?\0304\223\252\206\351\345?\2358\252\226\203\201\352?.\212\232\323b^\352?\200\247\250\t\334S\261?\317\377\034\224\"\021\352?\370\313=\232\266a\356?\260D\002\306s!\271?7\204\213t\306\274\351?\272\2212\003\265\353\345?\230\216\236\222h\006\324?H\356\033 VM\323?\300\252\361\023\000o\244?\204\034}Q\232\256\344?\351\243\262\031\272c\350?n\034\234\206\315\353\330?\340\256\322\032\3351\354?\300\227\314\330\016e\263?\014v\300\0034-\351?ST\367o&\370\351?`F\234\214\356t\333? 
n\340\207t\357\241?\016\315\266\231\327L\336?\265\354u\226\346\335\344?\277)\300m\260\334\346?\307\272,\03592\352?\334\315\346\361+\345\331?\3405\266\202\315\005\301?4\272@\334[a\303?\200v\343x\027\236\250?e\206\"\354ic\353?\244nM\306\037o\305?\366#(\317\216f\344?\344\001\327\276\350\273\315?\\\250\217\320>B\306?j\0311\207\254\270\334?\374\262\330l\032\317\320?\253\000\036\332S\264\343?\316\272\177{\313l\341?]h~\365\370\335\351?P\327\307\335\036\335\337?\023|\356\352\302\256\347?iB\n\330-|\346?\335\203\304=9\217\353? \r\203$\212\261\224?\331\355L_L\250\341?4\031\365\327C\316\330?\000\233\204&\362X\256?@\302/\323Ue\205?]\335]\341\237\264\354?\370\205\270\230<\275\335?H\335\264\036`\261\312?\342\335\n\tr\211\353?\356@Z\353.\363\341?\300\272\034\256\2145\331?pv\306#\324\032\320?8\364i\020\037:\354?\230\201\260k\036\010\267?.e\323B\036\326\326?c\301\353\036UI\353?x\211\334]\253\007\261?\240W\206\222wj\323?g\351\377{R\321\351?\204\341REtj\316?\377\226\377\003}%\346?\360\212\006;\035\374\263?\365n/\316;Q\355?/\216\036\240\3473\341?\267\003\357\262\330\037\353?\224j\351\244NN\350?{\32116\003\373\350?0D\031\317-\302\322?N\315Z\342<\331\325?\016x\031\217\346\013\322?\240\212@gSC\354?\200\323\275Z\266\265\251?Y\177\241dSC\347?|\340,\242\215\313\352?\370\366\266\236\033_\333?+\033um\363d\341?\036\033\253q\277t\324?\330\3669\305W\207\326?\206\340\217\336z~\321?\343wd\367\330\030\345?\017\215\0231\347\014\356?\035N@\261uC\346?z\256\266\021\016{\323?\224q\250(\211\206\322?oO\200\347Z\222\356?H\230\256\352\224\320\300?r\234>\222\346\321\350?\014\313\266\222:[\356?\006YN\034\235\230\327?\030*`F@\327\306?3\246\371\307}\361\351?\312yn\017\267\332\324?\221\215 \370F\270\350?/U\030\006\r\322\342?P2\002\215\246\212\346?P\213n\211\034\351\313?\240d\247\311\\\277\351?\230\3362\002U\233\271?<\336\307\333\023\014\351?\006Y\227\346\016\325\355? 
B\035\t\210\275\223?\030\035V\253z\302\356?\005\004\3258\rS\354?\236\000;\217:\016\331?\240?\037\200\272+\355?N,\355\336\331/\354?\316!\016\366?\n\350?\360\007\224\277\331_\301?$\020\314\246\026\373\336?\232\323\331$\n\246\334?\215\213\353\334\307I\342?F\260u\376B\003\345?\300\250\312V\220\246\341?\000\243\202\375\024\335\314?\264q\275\331f\250\317?#\245:\212}R\357?\226\200\003\036\213\302\341?\231\203\220\232A\245\347?\346\203Hx\273\353\324?\010\r\200v\273\221\275?@?\\cJ\277\335?WM\377h\364\315\346?HL\016\272\020\206\275?\t\316\035\321\363\n\352?IO\036\375\252\274\353?\354\367\031m\013\027\313?\017\357\224BE\271\356?\316\264\037X\212\004\347?\311I\r)Z\261\343?\266\0145\275\235\325\321?\245*\313\344\273H\343?\246\t\325F\301\007\350?(\316\254p?L\317?\276\3658\261\340\377\336?$\265\311F2\243\352?\025\332\231\005\256\323\345?+\036\226\244F\235\341?\347\360\204H_]\354?\265\033\314\235\033\303\343?7~M\343,c\351?*\005G\311ef\356?\342{j\341\335\003\355?`\025\246\022\270\357\255?\323\303\202l\262H\344?\375Wd\310~n\357?h\345S\267\374\315\275?.\032`{\251z\346?r\354=\t\223\036\334?|\323m\243\355E\324?X)\213\037\272\263\260?\363Y\037\0310\356\345?|\311\031y|\315\330?\352nE]\010\373\357?\232\177d\3629\356\323?\335\265\n@/\376\356?C\356\224\375\203\366\351?\352\r\300\002wx\356?\342\377\250t\261\363\320?\352saF\325\201\351?\323z\264\225S\233\344?T\370\210\222A\354\340?\331GRQ\355P\357?L!\020/\250w\354?\322\275\364\222\320]\323?jJ\3751U\337\341?\325\260U\253\316O\346?p\210\332\036](\270? 
\024St=\201\264?|\251\272,zY\357?\260/\rK\213\330\355?\3642k\253\272\335\314?\340\2718\363\304\034\351?H\265\200V\301\305\310?@c\000!s\260\350?$g@\030\315}\343?:[Avj#\340?\260\362\241\357\346\307\352?\363\310\374\217\342d\340?*m\367\204M\272\354?\213+g\225SS\355?\207y\257\267i\277\354?p4F\'\225)\272?\030\3733\375\274\306\327?\210\275%\037\036\234\351?D\202\232\340\374\262\324?$\r1\367&\236\352?\341\277\002\373\\\374\350?>\347w\362|h\355?~\\\224\276\364\023\357?\274.\336=\026<\332?\342\020\325\263\230,\321?1\350\372\337\371\303\357?\311\257W\326\367\365\346?V-\006\004\r\345\354?\266\266\317%\306b\345?\000N7\347Q\322\210?\022 \274\351\355g\337?`\212\372\302\367\007\270?\234\305\354\2229H\345?\270\336\013=\256\324\355?J\262\225,\320:\336?\317\275\262\203\227\016\346?\310\233\"\274\312\321\326?\300\254\330\021`\332\351?2\360\251\307\236\207\323?@\260\256w\251\352\215?\325\276\375\242\013\266\345?T\326_\371U\232\315?\007*&)z\275\357?\364\021\375\036Q\247\313?\013kfW\312\356\346?0\213\244J\266\275\330?\247\276dA\022j\342?Hl 
}\360\371\322?h\275U,\213\201\317?\300\241\364\003\271K\343?$\001\361\200\354\353\303?0\204w\250\321\037\351?\021\t\300\333\227`\347?\224w\t\243mu\347?h\305M\005\237\366\263?\263L*\225|H\357?\341.\277\231\271/\347?4,\363p\352\006\301?\034\300\373\227x\230\340?p!\305\322f\205\261?\213\235\276\"\233\026\343?l\343\354\373\321p\341?\214\257>s\243\004\321?\340}\315\253\321\007\256?0\305\2272W{\265?D\221\232\215\365\023\307?\224\247\366\342A\243\333?\241\316h\313\355u\354?\373x\201\037\224\004\350?.\001f\345c\022\331?\245;\247\306u\r\344?\262\275V,$@\357?P\265[\276T\236\347?\"\007\261k(<\355?\251\224/\366\362}\350?\256n\261\277-\270\357?\302\223X\246\345l\335?\020!\2049<\233\351?\022\300\307\313\342]\357?\362\303^\027\364\025\351?Apf\207\200\014\345?\242\356b\023\2277\341?\037Y\263\250\254k\346?\264H_=k\025\335?`$\301\2104\035\351?k\320\342\302\215a\343?\010h\032\3532\035\261?m\226u\364x\333\340?T\013\007\254`\364\337?\302\304\224\016\376\254\345?P\225\231\010Y/\331?\346=\334\020;\234\341?\023x\370A\335\301\354?\025\361\306\322i\206\343?\272Ke\032Y\007\354?\360d\024xfs\341?\210\247WP\236\246\276?\302\307\2653\325\304\355?\310\346Y\305\277\024\354?\225v\222\017\323\037\343?\266\347XQg\304\330?JD\232\310>\247\355?\376\245\224\335\363V\350?\224\322\333\006\326\025\313?Em\026\303\373^\350?\236\327H^\324.\323?\200c#\211\0328\242?\264\374\255\006\345\032\314?\361[\337\235\033B\346?\264\346\257b\376\216\334?\252O\327\301\303\236\350?\366,\263\010W\265\340?]s\r\030,7\357?\233y\265\345\033\263\341?\346\361\316\344\334I\347?\027\025\253\306\305\204\340?x\263\rX\375\177\350?\260!\222r\234\n\240?\354\177yAX$\327?Z\023\356\263x\303\323?_\225#\272\020\370\354?]2\252b?\200\343?\217\036\331o9\236\340?\025~\304\375\330/\353?S\016\306\332\206d\355?\336\033\005\2065\346\347?\354\200p\331|\035\341?L0\337B\016\226\307?\261\362/\356L\273\352?\240\350\374C\334\244\232?`\345\027\'\361$\314?\252\344m\005\332\371\337?\365z\213{\226\021\341?\260\241\357\310\277\257\325?,?S\225:\033\332?\370\311*\00
3\030X\271?\242B\270\264\314(\340?\272J \233\305\364\342?\000*\243\217\206cb?\206.\3165\n\344\331?\255\377\305c0\004\346?\211\232\346)\003\213\357?QiZ]\310\025\356?\204I\221\252\357\327\322?\242\202A\215\367\237\354?\322\247\234*E\333\333?pb\240 \220\322\341?\330\362l\364BV\332?\204%\214\036\016\367\337?\000B]\317)N\232?\340\263M#\345u\261?\224\257h7Ii\335?|\r\"\261O\367\320?\001\336\307\017\3230\350?d(\200\371\306\322\315?T\023N`\221\364\314?\202\307\037l\212\002\345?\224\030\326Z\343U\350?`\243q\226\254x\347?,\275\237I52\356?\000\235\034p\253i\240?g\267\374g,1\340?e\357?\\u\214\251\010\024\350?\242\343|at\335\354?\211\303v\37332\345?\213\240\274Z0\320\350?\206?t,`U\337?\031>\207\017\2661\350?\264\027En\235\345\304?\214\371Gv}\274\301?\326*\021\242\036\352\326?\333\"\026#\2421\354?\200\312\333r\306xr?T\250xi\226K\351?,Th\252N\220\331?TF\3654D\343\322?\215[\370\207q\210\356?^\263}k\255;\346?\n\261O M0\346?\300\373be\332\343\340?[\340\334\272\264\306\354?\350\335\231\016:\243\275?\261\211\214;ka\353?L\013\033\243sp\337?:\360Z\273\354\210\341?\200%\276\264\306\022w?\200M\032\351[\'\334? 
\333\017Rg\266\336?\342\333\231-\241&\334?d\017\2412l\241\342?H\226\212.\225\345\351?\200l\027\336B\356\266?v\365:\372\240\332\336?\2200ND\227\374\335?\345\227\260\352k\333\346?\262I0\340;\327\337?h3`A\'\017\341?\364\345\203\357E\254\341?\000\351\330\266j\272\276?J\2043\323\264\356\327?)\252\262g&\222\342?\275M\372q\356E\355?5\022\312@\260\274\346?\274\216\0031\037\013\300?\370\022\3035\313\212\315?\326\230\266n\2236\320?\346\241E\221!D\343?\240\325\327\"\343D\326?@6\360\306*$\340?bi\272\024\263\233\355?\"\026\344P?\340\357?|\363\215\374\004E\323?\214\253\304\235\370\270\343?\227\243\211\t7\245\352?Ds\n\212\033\230\343?vf}J\201g\344?D\321Q\002>\024\355?w\370\242\033\313\260\352?\353M\r\240\273R\344?\260\273,\030\255r\353?\n,\215\0037[\325?D\314\350X\r\231\346?\272g\335b\234J\354?(%\301wU\365\346?\274\277\374/\364\035\313?\276P_S8\t\330?\035\205\347\347\326\004\346?\240K\301\250\306\306\266?\230\030\371\262gQ\345?\000\2468\221\3367\352?\014N\235\030\311;\307?@\355A\257\306\340\334?\210\240\237\352\310\351\305?x\316\272IM3\326?\024\014}\250zz\333?\354\r\023\n\234P\350?\232\353ZZoF\340?\3261\223\n\255\210\351? 
\371Z\215\304)\302?$MQw\237m\312?\t\033\350mv\230\352?\010q\t\275\322\323\331?~\306!\277d=\322?\202\221`\221/?\352?\210o\255WJ\277\327?U\224&\235\322\212\352?\010\324\034/\342@\276?\034\026\021\177\304:\303?\026\007\204\363\226\001\345?s\330\033\360J\004\352?E\322\375\235\277\320\357?\200\271\326o>\375\267?\2445q/\332\247\312?u;\233\232\275\233\357?\242Yk\254\336.\326?\246A\245!\024@\356?b)U\311\237:\354?\224\204R\223\3557\326?\200e\3146`6s?S\025\207|\343\330\344?\214I\333Vg\361\326?\004\353\364\220\301\335\343?\342\252\010\371k/\332?\277\347\317\347U\240\355?\243\362p\271\010\006\355?L\235N\257h*\346?\203\245\243\037HH\355?\007\225\247\212X\327\357?\352\236\350\207]\257\347?\210z\246I\373\363\336?p\310\371\036\321\313\341?mk\327[\226\232\340?\374\353\005\3207\366\320?\360\317sV\247\372\273?6\341$/\230\032\354?\271Oa\362C\272\347?\350\227(\373\363\367\355?0\366<\370C,\345?\312=\017^\306\304\350?\354\245Z\334\303\254\356?\256;f4%\334\333?\220,\222~\274\026\270?\030\020\377\216\223\376\354?\356\310\215\254m7\347?\234\2603\254u\206\346?\022\257<9w\314\321?44@\031\033\352\301?\020\206{\322\307\310\267?\373\010\327&Y\025\342?$\237\2748M\246\337?\000\210\033\376\335\333{?\276B1yT\250\332?\034V\360\016\240\026\345?\344\270\265{FG\327?\304\366q\312\261\314\347?\0267\350\014\216\263\353?W\2556oq\372\344?\342\375\230q\244\037\340?\315?\020\312s\213\343?\346\315\014`%J\331?XC2\345\003j\350?\212\242\023\305\2428\354?gX\302\324\345\367\342?\337\002\327\023\256\246\341?\374\003!\n\353\221\330?\212_Q\315\033\353\357?\0204\241\352\007\275\346?[\205(\314\264\272\344?\240\024sQ\3463\336?\206\376C\300\302|\344?|\313\005\020\313\320\330?\377\365\025\243K\t\343?g9\350J\352\310\344?\330G\214\213h\252\331?\254\351!\351--\344?\322\376G\005Z\314\345?G\307G1\374I\346?PU\210\'\232\222\260?\000\267\024h\345\n\245?\250\310\325\231\306\255\341?\020\242`\017Ks\314?\032\311;\352y\235\332?Zx\020ll5\322?m\264\357\"\201\r\347?J\272{\304A\260\331?P\035\037Rm\225\245?\037L\267d\035\367\355?\014m=\216\3
56\300\352?\361\312\3426\362\346\355?\326\2654\205\001\320\357?\017\344\356\332\322\234\347?\370\333\233\313\244\337\351?\334m\343\260;\341\355?\032\364\266\363\332Z\355?\264[Vh\310\306\305?\320\2662\267*\230\330?\205x\275\231f\245\347?\254v\'\205\007\316\340? GNQ\017\210\226?\001a\266\340\266q\344?p\267\240|< \255?U\247\367`\025;\344?\256\311\325q\226f\355?Sk\001\004\300}\350?\310+:\253\361\237\301?\004\356Q\365O\230\312?f\003\210\032tG\325?W\273\2614\034\024\356?\274H\254\0216\360\300?z\352\252%F^\352?h\266\020\201O\005\301?\376\235\314\001\3373\327?m\246\300s\233{\357?\344\373\344\317\215Q\300?n\002|\273\275\370\331?\220\321n\220\\\265\277?K[Q:\341\235\341?&Q4\005\361\234\347?DaWJ\256\305\305?\275M\331\333\216\317\344?\334\325\365\377\014\000\331?\330\376d;\246+\356?\033(\\f\262\022\345?H@\245A\220\324\343??mBa\013\314\350?\216\355(\227\240\027\354?@2Xu7\013\343?>\333\207\320\200\270\355?\340\362\200\0334J\353?\347\241\226g\004\364\342?j\253\246\234@\035\354?V/s\314\374\252\345?\206\264`e\212\035\323?c\365\331o\352\264\345?\212\026)w\216-\353?\010\200)!\341\361\345?zf\220\313\220\021\320?P0n\361\371\020\265?;Tb&J\300\352?\202?L\005/x\344?uK1\310B\020\345?@3o\332C>\240?\037\252\254\232\n\227\356?\370Oym\306\271\335?\034\314\351\315y9\310?\000\212;\026O\026|?,E\221\212&\205\314?<\217\233XfO\337?\222y\362\307\0052\352?\354\020\201G\2028\323?\330\230\337\334-j\315?\271\033\277E\365\223\344?RA?\203\007\223\325?\240\261\245K\217\261\232?\231\253\r\010|\031\354?R\252\027\272\370s\326?d\356WwT\236\305?\334M\363\000\010f\347?F-\375\304\355\032\351?J\376\225\320\204\364\336?\305z\342!\240\007\351?\250\274\313Tu\366\315?\000J\037\262\nC\233?E\177\251\210\'\302\346?\314\304\255\273\305\240\310?{E\234@\202\325\345?\360\310l\000r-\326?\360M\354\227r2\243?\020 
\026\037\246(\310?\025\314\227U_/\355?61\267\032\307\274\337?^\025\257\003\316m\354?D\212Y\006\274\341\306?\250\333\335\271Y\227\326?\300\013\267\320\332\276\306?B\214+SQ\003\325?\344\336\227\336\253\233\327?\230\340\331\037x\032\331?2x\320\024\356\374\350?4\235r\276\021\027\323?v\244\377;\223J\356?\231\023F&\236z\346?\314pWW\025\021\316?\"\243\310\351Q\311\341?\320S\201R_\025\343?,\306|t\025\324\340?M<%3\236\301\340?T\2320\366\"\243\325?\200/iG\266q\331?Y\373?\31405\341?\242\204\036\377\020\327\321?\332\017h\357\205\333\341?y\036\301hYC\344?\236w\3431?z\333?\224\207E\026\244\324\345?\200\014\203\325c\n\310?\276\235\245\262\024b\347?\027p4\353\177w\347?K\202\020*\014\301\356?X\344\037\024\"\344\327?0yxj}\347\256?\222c\3116\352\263\323?\026\217\375z\n:\324?\320\262\027\356t\327\312?<\205f\214\210\322\300?\344\242\261y\350\010\307?\230\031\033\007\234\350\311?\222\365\200e\257\313\321?)\267\025zf)\341?\252\250\203h\376\034\342?\376`w\354\\\373\320?\"\302J)\021\210\332?\207X\304V5Q\353?\200\351~J\224\223\305?\240\316So\317\304\271?^/H\343\217\273\326?\226U\215*E\010\346?\332\272\350`\026C\341?\3743\224\216;u\305?|\360\272\214{\272\343?\256\303\035[\021N\325?w\340\370*\2762\341?(\354;\265\312\270\315?\354\032\346\026\006\341\311?\003P*}\026 \354?\016:\217\255\315\211\355?t\033\006v\206\330\301?v\'\354$`\006\335?\246\241=\207\022\020\324?\244\267\332o?e\340?\000\332B\363~\215\315?l\271^\275\237S\315?\205+\2066\230\374\344?\356\016&\030\373\306\352?\220\373\335h\251\321\274?\226[\363\037i\241\357?]$n\023\272\r\346?1\027N\273\226\253\344?\234}U\350t\024\306?\342 \024%\212\237\343?*\252\210\265\001\231\332?|j\360\234\245t\320?`\277G(\036(\275?\2070e\030\363|\341?\322\021\327\006#\373\331?c 
\373\306}O\354?\300\261NG\r\350\307?\344\350\221\2474\030\347?3\203M\226<\261\347?D\240\025Z\256\375\302?0\342C\025e\340\240?\002\256\275\355\003\003\350?C:.\262\220\232\342?)\034\0345NH\352?\324\203\201\027\005\256\321?\364\000;\021\214&\314?~\272\254\004\ri\324?\007?g\017\351\362\343?\313\310\250\231\342\307\341?\177DOG\212\204\355?\014\3548\240l\236\311?\320\251\255\345\274\334\305?|a\265Q\361\362\346?\274\345xC\256\233\303?P\3610\350\244\374\266?\200\330\234qmz\315?R\250\341[\240\340\352?`x\227HD\021\300?\267\303>\375\322\300\357?\036^\365~#r\324?\"\201\263<\016.\324?\300\001\245?\1778\316?\226W\3227\321\275\333?\000\020\334\\\235\300\262? \177,-\274\331\273?\202`b\3063k\330?\000\225\340\263=\024\263?\272(\324\031\204\037\356?\235\313j\352\270[\353?\370\306\307\204n\231\317?\344\353J-T\364\316?(\223\322\247s\335\340?\354s\177O8\234\346?\000%+\3316-\226?,(X\315\201\263\353?tz\345\253oY\357?@\235^7\213\t\346?\277\022\337v2\246\344?\010<\352n\256\342\312?t\337o&#@\353?\241\245f\225R\310\354?x\222\346\375\3265\340?\360\354I]\324\375\305?|S}\322\032\276\331?\261\275ayM\275\346?\343\221\345\260\256\364\344?\236\376)\004_\346\341?\304\017\204|v\252\346?*\265^_\235&\351?\000\243\205\230\367\027u?\360\277!\2347\023\256?\200\363\273NE2\333?\343\200\366$u\231\347?.\211\314\360\001s\337?\275\277\177EH+\351?8R\025\245\302\352\327?\350\353\361;\207/\327?\014\313]x\255\224\321?\347\220\365\300\0308\345?R\025\010\006\3078\331?\266\210\035\225\303;\343?@UJ\201\363\340\232?\220$\235}?\326\244?\\\2241\242\354\330\315?\033+\n7N\305\344?\227\322\362E7\020\350?\310\330>\234\2623\320?\246\037\240A\234\274\354?\222\2328V\261\017\325?\300;\270\345=\177\340?J4\375\367\346\017\354?\341\331\326\336]\234\341?\300\213\275;Sy\224?8L:>\304\322\314?Ta\274\342\313\305\330?\301\310\350\2078\'\352?\373\357oT\372/\350?\3209\220\346\374\263\301?\236\350~\225\330A\342?\370*\225\335\360\241\351?\000\223\324\230\265~\341?\224h\364*\302-\337?\316TiW\2709\336?\354\213Y\"O\361\355?\230\273s\250\367\030\33
0?\"a\264`\226\030\326?h\307\017\352\372.\323?V\301\240\034\021\020\343?\255U\316k\363\323\354?\315\257\270m\267\031\346?\016\2077pa\006\331?\3551\237\032t\276\357?\303K\260\265(\247\352?\"\274X\272\311\r\357?\034\034t^A\315\303?\310\247\257\257s\257\335?$A\267\303&\316\331?Z\246u]\216\202\351?\244\030\004B8\346\353?\027\325\024I\027\322\347?\266O\315\\\214D\345?{\317\252\017\264\206\355?\361\017\272\031\010\333\343?\262\021\234\373\274J\326?\226\344\2320\361T\335?\226\206F\222F\210\336?\217Ph\345|\"\353?\320\264\272\270\333`\301?\000\"gW\"js?$\021\323\004z\275\324?\314\354\350 `\335\335?Y.\367\311s)\353?C\235L\272u\304\356?\017\342`\341\334p\342?X\r\305%\337L\321?j\370\257\001s.\351?\\\266\2623\0203\325?\216\'s\311\253]\352?6\033}\'d\022\344?f+\205&\370\214\327?\212\241\242\r!\031\323?\300\257\370G\001d\230?(\341\000\023\202\205\316?\350\337\037\265\035\364\273?\307u\366\344\025\005\354?j\\\255M\210\254\327?\002\362\034\301\321\031\321?Xbi\375@\335\353?x\224\035\245\207\370\322?2\202R\347\366\321\337?\024\325\277\310\350)\311?\360w\3429\207\263\345?\r\300.a\034!\353?s\247\321\035\314\014\355?W\315\304\001I\365\347?\364\351\264\241\002\362\311? 
)\326\357\334t\312?\200\377\342W\245\236\257?\316~\271\314\266\343\341?\272\\p\372\000\354\337?v\302\271\311c\341\357?\254\014\250\244\321\030\336?\204\347\003\206\363\000\311?\000w<\302\302\246\343?\030G<\035 t\327?\264\004\371Q\307\010\321?Gxu~w\013\357?d\253\000\035\016\030\335?\2645k4\263\305\314?\237\231\343\204\342\212\355?\205xz\243\342j\340?L\275\2651\215\330\325?\340\242p\035\n$\271?D\362\370\262e\031\344?=\332|\276V\252\355?\300\303b\340H\036\233?\004\333\351\312\302\241\340?H\004\3564\\A\321?\002\307\361\211\235X\331?G?q\374fe\352?\332Q\344B}\014\323?&:\364\367M\010\347?\030\n\\\'\214\"\353?\261\262S\020[\311\345?\204Q\\:A\326\330?\261w\351h;#\356?Qq\335\300\324K\352?\253\207S\353g\265\341?h\323k\250\367 \317?)<$\0348y\342?\214\377d\336[_\323?\204\006X6\362\263\320?\360\341\324\026\236\365\322?\301D\024\370\304`\344?\344\341\310%\241l\301?\006\257\242\361Gm\342?\3307\235\211\330\312\333?`\367\007\377DQ\335?\320\034\204\020\346\334\350?\244\204\2309\223g\333?\210&\177\004\311<\345?\022\007\310\313\267\300\346?\240\025\034Os\203\234?\320\032m\250\243f\341?\314\334\035\321\213)\343?\277u\355,\3232\344?@F\343\245[.\252?\344\335\316\267\3725\351?\220i\274Pe\214\257?@\340&\207wa\306?\005T4m\2228\345?\316S\202\025\323}\353?\360\217\026\212+]\337?(z\365\274!Y\321?Y\276\302\255\270\277\342?n\322\2503\003\356\347?`E\355\212A|\277?x&\207\005\245\265\341?\224\277\224\270\363}\344?:I\216\202\310\242\336?\332\020Ah\256\260\343?\317\r\'\360\257\020\355?\300\302\322]Q\'\236?\227\244\026T\277\371\344?\237o\273\202~\022\347?\2401i\331R\035\223?\037\260d\371jT\345?\212I\360g\232\235\322?~pX\212?\353\337?\330s=\020Jl\276?\256$\021\274\031\221\333?x\270\001\016\224\251\300?r\303\326w\2777\354?W\200\211\361\023\262\346?\264\316\374]\365z\355?\372\\\022\251\234=\335?\274}\014e\330\373\313?\000\230\232\225#\017\257?\000\033\233bt\345\200?\200\215\356\2054<\344?\210H(\352\030\351\334?\216\276\337\272\245\266\342?u\212!*\361`\356?\030\375\243~JO\344?i\271\375j2~\341?\r\347\205@=\3
36\345?l<\345?\346\"\335?\300:\371\375w\r\350?\265\212uPJg\342?\014cr7\266R\312?\346\310\374\345c\211\337?\307_\301\344\266\017\350?\354\317\331\3475U\350?@b\370\003\224u\337?\000\303\225\022\215\351\332?\224\237]\227UR\322?\360\272\276E\361\016\341?\374\361\025?\271\271\305?W\3027\260v\013\353?\362\223\206!\265\322\337?\270J\005\246\007\037\323?\271\300\017\204\242p\351?\306\237\327\200;&\334?\360\244\352\324\317S\250?>\355\260\205+f\355?`{i\231\277\033\306?x\037\246|Yk\327?\206\211V7vZ\327?PJ\254\335\022\211\345?\327\260\335\202\216\360\350?\017\352\021\340\235\202\341?\014\302\233<|\377\332?M\300\327=D/\355?\220\305\366\323,a\244? \205H\262$\232\324?,~\376\247\317I\331?8\347sT&\204\357?6\003\207~\357H\335?\346+5\304\200$\322?\337\371\336\375T\321\340?2\307\231j\310\220\333?|q\021\203\337\025\315?me\005\334\356\232\347?\320E\320\330\332\325\317?Tz\232c\256x\313?\320\020\323\332,\207\321?#\231\320\226\335\316\356?\244\\\340\304I\201\300?\220\001\373f\213\273\327?\264\236(\361\313?\332?\004@|\230\220&\345?\242yi\243H\220\347?h\256\373du&\306?\370f\316\2649=\322?lC\240e\317\036\356?\300[a\235)Q\252?\374V7,\014\262\316?S\232\341\210\270\026\346?\271W\025\374!\026\341?\022l\001iin\342?.\306\263\346\020\250\354?T\216\010\230\025\254\306?B\262\013\021)\270\334?\000\017\333\347\273\023\345?\344\276\264f\225\037\320?\252+b\266T@\323?W\217\230\247\345\222\356?\300\202\3422U0\244?#\311\236(\225\232\343?\2309\331\317\224\320\314?\320\375\256\236\036x\327?D\320\006\247\364\037\331?`\027\215\020\216b\264?d\230\375\235\241\257\346?\353\344\256\336\320\364\347?\000\234O\0053\225T?\332\007\372no\377\327?\320\266}\237\241\316\341?0\340;\\\370S\352?^b\222\3062\262\343?\007L\334\230\341\316\356?\333\370\222\225\240\301\344?x\313\010+\343,\326?G 
\303F\0105\353?\362\223\005`\231\361\337?{z\237!U\351\350?\014n}N\2078\353?*[&\010\205t\334?0\324\271\256b2\321?s\333\266|\025\233\341?\224\360\343\256T\353\325?\352J\221\344\243h\333?\346E\347~0\177\350?\276\310o\354\254k\354?\266\213g\245H\304\343?\302\245\3176\036\r\330?\274\375\272\355\312H\347?\2340\374Z\'\205\314?t\025\017\325\220l\332?\324\373\255\377\215^\326?\320\342e\270\314)\265?W\244\372N\203\r\352?\252d\277-\345.\331?FIE}kP\323?H\207h\306\220\003\320?D\333zS7\210\345?r\001\332\336@\373\323?Qa\244]\035\034\356?\030\214\331\246\220=\341?l\255\300\020:\"\313?A\251\377+Hv\342?\024\365\333\005\243\243\332?\327\351\214\314\350\333\350?\264\004x3\227\376\322?\353g-\341H\373\356?8\013\367\003a\344\332?\270\361\003z57\275?\316\'!=\272\201\320?B\227r\360\221\247\356?\000w\221Je\226\332?`~v\025\275v\302?\372\3221|\211p\347?ze\017)\364P\353?\2748\003\004\375\261\323?\210_9\307XW\312?7\330\362\321\032R\356?h\026\tP\n\247\312?\251\265;\365\353\302\346?\240\017\340\366\231\332\273?\377\376y:\023\354\354?\340\"\345;]3\343?\276\223H\303[\274\334?\026\256R\020y:\320?\271\347(h!L\340?\334{P\376\240I\356?\262(\004\212\343\034\356?\220]o@P\253\243?l\005\307\215\334\377\333?\251B\324]\233\036\352?\034\336?H\275\335\322?\250G!\2769\023\356?\353\375\256\252\360\366\345?\020\204}K0\320\346?\014>\300\237\247\277\303?\304\330\242&]y\354?^s\336\233\022\023\344?\202\3719\204\305\313\347?\026g\036\352PX\334?\037\362\244\347\2326\340?\034\346Iq\233\260\356?\310LJt\270\014\322?\313s\364\361\300\233\356?\034^\034b-m\312?\"\370)\321}\023\321?\030\021\025\266;\302\323?\262.\0030\035\362\352?t(q\312\236r\310?\300\'\244+\350\303\270?:]\204^\374%\350?\274\223\340e*-\321?\373\240b\245\252\373\350?\310\302\017\353\035\203\307?\010\337\332\345\302\211\260?\317\0375\247\354\273\346?\300O\272Y\177\013\207?F,\364\023l|\353?\333K*5(T\347?\006\033\300\223Ky\320?\366\307.\350\302\252\345?T$\335\272wP\305?\364\031\007\325\2560\306?\024\207}\272\216\266\350?hI\214\r|\322\300?\\\"\367\334@\273\346?X\25
3\246\337\345,\347?\310\325d\357Pz\331?X\227\255t\353V\314?\300s\213\212j*\323?<\361\230\310u\007\324?\241sU+\231\377\350?\020B\254\214\270\327\320?\320\247h\303}\016\316?\010QeM\233\214\343?`\205\345eJM\232?\317\275I$h\022\357?\010\033\265\342\230n\317?\317z\345+i?\347?v\317H\376\217\376\341?\300\257\327\322\317\320\355?P\333\3675\251\313\273?\304\254+q\336\010\310?\312V\336\276\3173\337?=\337\3657\313\246\341?\306\372\332dR\215\321?j4\200\264\216\t\343?^\020Zj\2071\340?\270\330\010H\177\001\321?T\276\313\312\3127\313?\373\220\330\236\376\316\356?v\362\231\356q\214\357?=\357\002\357\3322\351?\202p\tvm\315\343?`\246\331\237_4\316?\024\215\nO\254\215\327?\203\351\311\247\226\207\341?,&\364p\211\257\340?@\025\273\313\302\005\357?s\273\313a\331\202\341?^\316\343\260\226y\327?Ui\324\273\363B\352?\000\024\237\2573\302e?\020\025\254\256p\355\255?4K\210\357(I\343?h\3603\017?\325\334?\374a\207 \232\002\346?J\032\332\255A\356\340?\210\2721x\260 \326?\251\314i4<;\357?6Sa]3\\\326?\261W5iu\031\346?\0041\'|@U\336?G\314\301mk!\351?\206z\255mz}\323?\257\r\025c\"n\355?\262U\n\3020&\324?\214\354\346\237\0309\340?\254c6\037]\206\315?\334\316\243\372\310\225\357?\356\214\333\315\274\003\321?0\231L\245\024@\240?\300\236\314o\312\010\300?\201n\"oZD\344?@\346,<$\215\234?]\217[\337\342\264\341?$XZ\246\300l\341?\245n\025a\305\234\355?p\370\330)q\352\301?Rs~v\004\370\346?\006\323\023b\016\201\322?Q\022\305\375\020\220\355?\030+&\300\002\354\264?\321.\020\213\204\315\354?\346\017\202\215\325j\321?\374\336\350d\026s\323?\262\246\005W\211_\347?\'! \344\270r\342?\324ei\3700k\353?hl\036c. 
\337?\320\317\310\215@\216\323?H\210;\227\253\221\331?\266\232\226\370H\334\357?j\336\240$,\340\325?`\213\361\256\341a\327?\210\010\220\373q2\260?\237\336X\301\303\333\345?\300<\365\375Y\352\261?2/\226o\337\324\350?=\3464\223\333\334\351?Mx\206\0244\025\347?/\313\310\3441\357\343?9\231,K1\345\345?\250E\244\332-[\350?\302Te&\021M\346?\230\273\324\372\270\361\354?\334&\354\001J_\351?\204O\376+\n\245\325?B\202e `\307\330?\252\314t!\224\265\346?\344?\350\220\177\312\036O\327?\335o\326?\374G\305\263v\336\355?\357J\203\314]O\352?\020\031\361\320\373\346\346?\371\305\322\237\301\232\350?\335d;7$K\354?\"\362\024R\013[\331?\010\0018,\274\025\331?\266\335\360\324\256\025\341?6\327\252\277\205/\324?\344\365m\262-\004\333?\246\024\251\225,\337\334?\232[t\363=\013\324?\330\217\3211\016a\345?\217\300Ut\316\336\340?K\327AC\351\007\341?\310\315\014jd\376\327?\322;\202D\037\245\337?\000P\373\371\251\362\317?\344mI\207\354\217\326?/\252\030\037*\274\346?\210\001|]\323\351\300?\236\207ve\204\331\331?\020\304\rDe&\257?lGF\360~\374\341?\342\001\231\254\222\304\320?\240\006\362>\243\"\356?\002w[oM\274\335?\366\323\2376\"U\341?\177\302\017\353\031S\344?\032\020\032\302\3006\340?Rx\341\014\250G\354??kZt\307\026\342?j]X\351h\350\335?\362S;T!u\355?\307\n\r\360\245\277\350?\373\204$7\366\020\343?\260\005\376b\323\220\346?\020\316\232\\\356\267\326?\361\376T|T\233\346?\2142\340|H!\303?\342\273d\225\211d\346?\364\006\207\353\373\267\313?H\230P\315;\243\324?\362S|\23204\340?\034\001s\021\236\325\310?\204\2069w8$\345?x\316\307\333\355\033\343?/\376;\211\306\372\346?\034\214M:\264u\332?\360\214\245\265\356\246\241?\324v3\324\313x\341?\276\315\374\312\351P\355?\254\315\242b\232\036\333?\200\002{\357\301\237\305?P\336ed7\274\256?\3520\374\311\216\345\356?P\027_\372\352\254\240?|dD\210\365u\307?@\237\206\\\2438\257?DiD\335j\227\342?\347\354\027\310\020e\354?7\006\237\321\256\211\340?->\325\374\273\252\347?,\333|\226\366\324\304?\272\001\370\306MV\322?\360P\322\330\010s\331?r)\374\346\223\177\354?A\21
2C\212\267\002\344?j\237\221\220l\243\350?\2764\027/\243\271\342?\024\227>\214\307{\317?\310\001w\215@\253\312?t\316XA\332\267\316?\306\213\334P-\314\326?0\006h\263!H\250?\223P\251\252\271\373\347?P\231\021P\264Z\337?sj\035\200\024\304\350?\340\227u\005\237\346\274?\343\0024\03798\352?\211Q_\356\364\205\354?P\205]\237\274\313\340?9B\344\262W\365\346?\272\003\360\267U_\337?0h\024\"\022r\346?\330\317tz\217\r\307?\256\350\352\221.\210\332?mCU\231\324\305\354?\334Tx%s\324\320? \352\202v\230\'\322?\330zbj>\010\314?\244-t\366/?\332?$\034\246\021\243\320\320?h\203\014b\0043\315?\356.\346\364[\232\322?\317\345\334\373\255\375\341?\000}Dl\242(\254?\336\326\273\353\245F\352?\271Z\254\004\242\356\351?\030\023_\377\205G\261?\000\300\302$ZZq?\270\302\331[\207M\305?\205 \245\030\224\031\354?@\275\377\020\270\002\334?\271\313\330{\030N\345?P5\255Ln\237\272?$\316a\203\311\005\344?,\032u\347\002\346\340?`\022H\371Z=\275?h\212\261\354\305E\273?\370\014J]\\\332\341?\360\365\331\006\253\366\250?f\341d\3015(\336?\306\250\t#43\353?\036\327\341\223y*\330?\340\247\375\275\222g\350?\350\360\326a\314\252\336?\350\371[\365\364I\343?B\\\277\264\001\362\356?\270\255/\266\3218\303?\016\363\322 u\337\356?0\320\007\261\247\371\256?\326A\204Y;\347\342?\227\215\272\222\233\r\341? 
\335!\355\305\254\303?|L\233\'N\212\354?\300\261\335\240W#\243?J\230\247Q\360B\342?\244j\325\301\202\332\304?n\023L6\030\267\332?\264O\366f\270\275\300?\310\306<\264\311<\355?D}\273%\373\305\344?\253\320\250\312\377\352\355?\025\235\2053d\226\340?\244\013\370\233\347w\332?\006F&a\235\263\345?\002\204`\331A/\327?\'_\320\337>\371\350?\230\342x\335Ze\335?\354c\207l&\377\326?\200\345\272e\271\271\246?\332\013\262\323\353\256\330?hb\016\346\333\211\337?\314-\317\222\211\220\303?\010\027\300J\006\343\334?\300\370@\256\231\205\265?1\267\320\257N\327\350?\270\372\245\3217\234\300?n\3573\312Sp\351?\240\222nM\266\304\334?\257\007\256\316Ru\345?W\035\343 \345\325\343?1\225G|\364\250\342?!\023\355\262\334/\346?\306s\352\273\266 \341?\353M\224\354jq\344?\210t\244\nP\004\302?\317\232\233m\332\006\350?$\022\001Y\250\000\330?\367\212\307\026\001l\342?\310c\217 <^\352?(\314\251o\013W\276?$\224\321/\275\250\311?H\314x\014U\266\353?2z^\363\032u\333?\305\327!y+\334\355?\337\300\256\030\3359\353?6\266\n\010\233\006\325?4\332\335\005{\345\356?\210i\033bZ\244\305?|\341\374*1\373\303?~\337i21D\331?h\033\356\277\304\340\264? 
\322\241YI5\240?\030<\361\373\321L\353?\001D\025|Q\331\350?\317\001`T\353\316\341?\224]/\351\221\\\341?H\216\017NI\341\325?\014\232.\242\037_\354?zj\237\006+\345\332?\360b\r\245s\352\265?Hn\300A\222b\357?I\177c3+\271\344?\235\000\250\020\244\320\354?\346\026\005\241\232\026\335?\352\222/Gr\005\346?\241\t\272\242Sg\342?\306w\237\341-A\336?9\262\302\\=[\355?\354\272\375\222z\243\310?\272\313\370S$\000\322?R\310p\224\337\255\327?\300\272\207\326\365A\275?\372-<\361\362\355\325?\216\013\005D\326\352\330?\260N\014\353v\355\312?$\257B\"\027\371\353?\256\014\010<\210z\354?8\271\371\013\016\370\271?x\001\303\241\315\353\327?\250\016\277\320\212O\300?#V\314X}\305\353?0\237\204\270\207J\335?8\0224sYY\305?\270\2753\241t\022\276?\014q\3746o)\336?L\211\021\356\221Y\330?R\024\262=\341\240\326?(C\321O\224\234\277?\215\345\241\310\3079\340?lY\241\354z\212\302?$}\373\272\010#\321?)\237\324\302\356\273\341?\270\024\010\365I\213\333?\254\211\201v\226\330\336?\266 \231\335A\317\324?S\'z\004\315\266\344?8\255\331q\342.\323?\030}\245\256l\325\341?@W 5\021\000\334?h\242\330=\257\364\345?p\205\321\t\276\331\255?\010\353\204t\364\254\272?\345\023\256\035\014\210\346?\000\372\315\303\356\205\300?\t\317Y\360\031E\347?f\t\206ii\254\350?\320I\202\341\341\026\267?\010\270\3575a\245\356?\246\364\232\343\245}\345?Vz\374\302\001\031\336?\360\367ah\374s\266?(\025\t9\246\347\341?\210\343\344\330\224\254\267?\274yX\367\301T\325?\340\212A}\304\207\265?\027\3378\354G\355\347?h\305\013\350T\302\351?P@Y:\374z\325?\266\215\256qQ\224\323?\262&\314\341C\335\342?\226\311\332\267\324d\357?\261z\250HU\222\343?\261*X\304\253\240\340?D\347(=\340\355\300?\230\247\312d\357\313\266?G\256)t1I\346?\'\353\261\263\214\327\342?\311\001\033\370(\274\356?\334\256\260\257MR\333?\024\247`\322zS\346?.z\343\260\342\244\344?\205\177\256\327e\264\340?\032/\260\237\304p\326?\240^N\233\202\265\334?\244\331\021\233\201\262\345? 
v\037=\024/\355?\330K\260\035\235\232\325?\233\0332\317\365\376\346?\307\017\240d\237\300\346?\257\207\260\245\210f\356?\036{!\254\244\t\347?S\036d&k\327\353?\000\017-@\245\242\317?n4\354\207Z \327?\000v\030P6\237\351?\336UJ\017h5\347?\230B\215\272\350\240\337?\362\270\333 X{\341?\014\357m\334\024g\337?p\264\306\211\327\323\271?\000\340*\276Z\267$?\240\360\300\006l\030\326?h\216A\021\233\315\321?(\341\361\026\325\220\315?\214(\216\241\317\000\351?\030%\255\311\217\213\301?x,\035q\356S\333?\220\037\303Z\024\324\242?\374\346\243\n\274\r\324?\023k\226\313\025\003\347?\377\"b\255<\243\345?#P*}\233\017\347?\362P\"7-\204\334?\355\300FR\032\305\354?\267\365\216<\335G\346?\004\007\224E\250i\301?\260\325]_\232\272\242?\332\"\266\230&\265\344?1\344\004&\324\250\355?\314a\312Q\227\325\312?\336M\033\356)A\330?\005\233\r\"\300\332\345?$\2510\306\344\306\300? \204\004_f\314\353?\242\326w\265\232\376\351?\200(\324\356\244\257\353?K;\365:\245\352\352?\370@\207\025\2753\323?\204\265D\336\016\024\337?2\322h\207\010\365\346?\002`VOCf\325?\277\201UM\001\234\343?\330\267\006vw\007\327?\212(\307\n\235F\346?\"6\006N\277\002\336?\310\277\025~\035\317\277?K\362c\227\264*\357?\276\352\260 
\325G\342?D\332M\004\335\211\313?\360\334H\217jE\340?&Z\3362\032\256\341?\244\231m\014L\266\343?`\027\231\215\207\250\237?\223\030%C\255\224\354?\356W\010+P\275\353?\361`\243n\246\377\356?l\003W\321\3429\330?\364\251\233G\2324\316?\245\\BFc\221\341?\220u\224\226\026\235\301?\245T\"Y\243\265\350?\360\200\257\010K\206\265?\230j\356\227\257\316\325?\237\030>\311Jm\356?\270K\377\253\376\225\331?j&\3064O\323\346?\313\306?\332\210V\353?\273\324=C\203\214\350?\354V\327+\262\001\355?\0004{\014\276\314C?\002\236\266\270\374\r\354?\332\323\036\020z2\333?e\202\2406\361e\345?\"8jh\356\360\333?\260M\377\220U\315\353?\'M\212\252F\310\340?\304`\256K\257Z\351?\372\364\013zN0\353?\340\363\r:\266\355\236?\000\206\372\267\276\227a?\347\'\'2\332/\354?P\245{\312:\224\306?\320bl\304A@\247?\352\252W\205\"\305\327?\233yv\346zL\341?S\020|\260oF\341?\202\305\013\316%A\354?\3403\000^vE\244?\026\204\303\177)\303\343?PMj\243~\240\276?\200\000K\313\363\207\254?42O`\303\240\343?\200\234jfz\212s?\322\3202&\2202\357?\344.\254;\252\245\325?\260n<\270X\312\316?\330\216\345\027\210]\267?\305\270\324I\336B\341?P\302\320\315\300\022\340?R\024k (\206\342?\254\243\331\302+l\324?\214:7\345&\257\330?v\210\253\342\225\031\322?p\356Sg\326\347\272?\260\014\210\2240\232\274?\000\266\337\304\347V\255?\244@ho\360\262\331?F\310\353\262p/\357?\370\003\363\313jv\304?\272\r\031Sz\020\335?\246\334t6(4\325?\272\216\240R\243\022\340?\200\347\344\0253\343\302?\234\264V\363\347\240\355?\000j\031\217l\256\204?\000\032{\306R\242\257?\310\020\277\302\230]\272?\200!\212\3530y\274?\360\212T\332\014n\304?\206\033\356\014\311Y\357?\345\202(\026G\243\351?x\306\200\016\"*\301?\356\275}H\355\356\353?\036%5\\\367$\324?\031\224gO\262\363\354?\036\215Q\243in\347?>\330\303\032K\356\330?\274\362\352\335x\010\356?(\356\217\001\017\016\310?j 
\002\246\366\364\327?\256Il\267-G\347?\030\271\325\344\n\273\337?\222]\254\"N\347\327?\214k\270\220\352\250\357?l2\017A\0053\336?\232\311YP4*\352?x0\317\324^\255\313?\336\t%j-{\350?\010\230B\260\203\n\314?\234\331x0-\316\356?8{&.t\024\336?O\262\266\023\342\354\342?^YPJ\310\214\326?\204\023.\352\006\014\317?\004S\232\256\005]\305?\323\240\327\240\200x\350?\206\230\201\343\260Z\357?hD\322.\234\310\322?\352\305A\325B\207\351?\276f\003\377Wb\344?\000\241=\360f\315\340?\370\355\267BlK\277?\350\226\004\007\367a\270?\272\304\371\010$\257\322?\014\035\312n\314\262\315?\257\223;\256[\276\350?\206`\316\332\247\346\337?xK\234\2006\215\320?\020\2359\211\003\205\276?\214v\202\336\331W\321?\234\366H\246\024\367\310?\317\3079v\304[\356?\250V\220Y\342v\355?\344\037\007\215\017\035\355?\254\272\257\323p{\321?n\233\'R\371M\327?pV\332\306\272\374\257?\365q.L\374\245\344?\207l\313\343\036\371\352?K\303\215\350p\350\341?\354\303\210]\246D\333?\346\206\270\254\036B\355?\211\224|!6@\343?\374\263u\217\240\317\313?\235\\O\262\226\311\356?`\212\224&\203\220\334?4\005\224\n\251\222\350?n\001O\313p\021\321?\210ca\310\022c\314?dx\347\026\266^\327?\200\203\022Q\221\277\237?\204\214^\305\333v\304?\024|W\035W\330\345?\340\000c\274\275s\317?\310\372\215\217q4\353?\215\337\312\222y\303\343?~\010\260[Tj\342?\305\357\rZej\351?\340\3043\222\232\374\260?\322?\376\333e\233\343?\374\240\026](]\322?_\374c;\352N\352?j\2749=\365\224\356?0\244,\277\035\341\336?\250\322\353>F\266\344?\340\000\"\341\250\237\327?!\323\242\373\234\242\354?\224\'\217\340\010}\301? 
\222\232\231\021g\350?\215\031\275\364\310F\355?\334]\304^\360\000\300?\330\004\252\016l\013\303?\234\005\352\302\267\210\347?\024\032\035\254\240\366\353?\304\230{.x0\353?\304\350\320\0279\376\306?0\213\2178\263&\333?\344\266\010\372\362\364\352?X\221k\311\375/\344?#\2445\304\007\004\340?\341I3\027\277\313\350?\200\3478\367wqq?Ib\222\346Nw\346?\370VKL\352<\302?\000=\212\247\200c~?\360 !\233|\332\302?\213\262$^\310W\345?h\214\352\207\316z\261?k\200\001o\277\334\355?a!\334L\267E\353?\273\346\2665\200\254\354?9CR\245\271t\342?R6\246\253\r\370\343?P7\350\241\2710\270?=y\304\327\2667\356?l\025kf#\304\330?Y\367\312\016\306\346\356?t\233\341\325\r\331\336?\214=\006\343\016*\313?\213{\352\303\025\347\352?\342a\031H\346\257\336?\206\001\014\341\374\330\355?\340\017\3708\267n\344?[ \237\314_q\345?&\277\362\250\372\207\332?\263V:\274\374`\351?\226LYo?\207\343?\227\026\250\272\271\n\357?\210\203\347\327 \237\335?(\303\216\251n\317\317?E\277\350\263\014U\351?\336\020\021\366\315\226\323?g\303\317w\007\261\355?\030\264\310\020[\367\272?D\2358\346\270/\305?\014\227\233QxK\310?\366\257\235\257\355\201\352?\362\2429\035Z;\326?\364;j\034\240}\310?\365\n\n\237\251\254\350?\231\211r\302H\363\345?|D\022Yd\247\313?\366;\003\225N\021\353?\342\274g\277\242q\322?\302\2327\013\237\273\324?\237\227\334\275\374\225\347?\235j:F_8\342?~\177<\030\333\265\344?F3\027\355|\233\355?\360\372\261\252\014\002\325?\363\274W$\033A\342?\204\225\004Ed*\321?\340\025\315S\200\365\252?xB]H\317>\330?\0009|\202\305\031}?\370\257\362\347S\201\335?R\320p\331\224<\345?8\244:h\221\206\304?l\032y\203\336S\314?\320\0236!m\224\273?e\346\364\304~\036\340?\010.u\311k\321\274?\"P(Y8U\351?n\345Z\234\004x\346?\272\025~\302\027W\337?\350\343\363\341\277{\266?\025$\341\220\2035\352?\036\252H\303\343\177\327?K\002\'\301&\303\352?\342\354\241\177`\013\322?\266~\251:}f\355?2d\265j)f\325?r\251lH\025\n\323?\020%\005\254Z\232\324?\231%\014TLW\352?\326\327{\363\323\377\341?\022\263NQ\343 
\336?+\010\324\306\270\037\355?\\\365\033\245n\266\320?\364\343\253\201\022\277\342?\330Z\272\310\370\256\262?,1M\277zS\324?\200\340\341\300\345\002\264?i\214\310\213\367\374\354?d\342\010\243D6\343?\320\037\014\214.*\243?q\337w}\033=\346?0E\'\026&\277\337?\000\204\315\227\236\004\314?n\313\034\344\347/\321?T\207}Ds\036\323?v\336P1\236\324\335?^\343\232N#\354\355?\354!\252\035\326c\333?2\256x^\307\002\345?( <\336\355\322\300?x\246\203\302\227c\307?\022\206\007\242\243\247\355?gd\211\264\336\010\343?\0344\322O\317\246\332?\237\031\345\347\226\326\354?\233z\177\247u\277\357?\3627&1\010\360\345?\227\223\241\"\355Y\340?\000\371w\022\200V\245?\213\220\340}\035$\356?p\036\211\250\214\227\257?\330\326s\354\316K\303?\315u~\270\267\365\346?\350\322\215\223\213\223\300?>\014\207\263\326\257\343?OB\n)\022\272\342?\354!\316P\305\261\306?\325\2669\006\304]\355?\351+&\372\372W\355?h\347k;\"Y\344?\202\350iZ\177u\344?\000G\013\000\377\222\273?\360\004\004\002\213s\350?\372K\236\271\330l\346?\240\243\336c\177\223\335?m\003V6\227;\353?\022\225\250m\307\034\326?\207b\236(T\222\354?\322\226\376\244\216\343\324?\\R\301GT\'\320?\232&\2727\221\002\321?\234\025mA\355\014\352?\232\355\003\345\211\232\340?\204\255\005\323+l\333?\220\317\001\365\247\237\316?\004<\220Pf\212\310?Z\323\223;\020/\330?\014^\327\251\211\010\337?\210\247\027\233\315\006\354?:V\027\032\263\362\331?0\025\010\244\027\361\246?\002\346\241\360\027\343\345?rGD~\014;\320?vf\270\347\315\035\325?\246\241\361\032.:\344?\246\207\240\303\306\334\356?HJ\nlY\271\312?\306,u\253\314\031\332? 
\243\352\366\t\236\321?\314n\0179cs\306?\264\020\227Z.\334\305?\240u\277#\217\350\273?\036d\336\"r\264\351?\333\000\033\025\243\314\352?\275\023q\037V\267\341?\034\023~w\342\"\354?\354\217\002\241` \354?\n\373\2377\375\240\336?\276vdX\247H\331?L_D\215\"\237\311?\260c3\254\312\324\352?\364_\246\251\310\343\303?\303\3643B\033\311\354?H\336\362\260\016\333\264?9z\017\213\352`\345?\034\357W\346\373-\333?\030~\376\t\017\265\321?$;\r\324(~\357?\231\205dF)\232\342?k\365N\004\025\332\341?\201\250\264\326\007\'\343?#Q\034\270\224\304\351?n\250II\247\254\335?F\2272\266\270I\325?\200\231~p\371Q\317?*0\230\311Gk\335?\037h\335A\333\"\354?\310\024R\026\3748\335?\2439\024\r\226a\341?8\341O\225\244%\337?\331[\023\203\3032\353?0X\354\351+\211\335?\220\002\\\322\223\333\317?$\350[X\263\022\303?j\034o\227\367s\321?\334\300I\231\207-\356?\000zZr\260\233n?\003\220t\037-y\353?\274\311pt\353\033\314?\0007\345MN\341\250?(U\263U\240W\301?\202\311\032\200\n\252\325?\223\366\314\3169\007\340?`\367\241\216\232\203\223?04\357|\224B\321?+9\035\204\332W\340?\r\275\214\022X\327\353?\024\020\353a\205\311\311?\246\310\325@\325_\354?^\344\220\316.\323\343?!t*\320\315#\345?\372J\375z\232Z\356?\270\353\341\270\020f\265?\274Ie~W\315\320?\005\337P\\\033\032\350?\251\250\322\007c\372\343?\224f\3606\237\255\303?\240($t\372=\273?;\235x\240D\345\340?\240\347\022Y1>\255?\344A\\\220nS\302?;\027c\017W\224\343?t\315*\000\312\304\330?\016\020m\210\343\357\337?`\206\263Lf\031\321?\204\227\253x\202\332\323?\301\245)i\225\327\354?\0138\244-\352s\347?\376\366`\265rF\324?\300\035c!\213\037\216?\002\030\322\216<}\343?h\327\364\363\023N\276?\245\257\213\302\337B\346?\302\223 \374\007\241\326?\000\263\013!\326z\262?\314\311\250t\023\306\332?@2w\326\"\311\303?v\331\004\036\250^\345?9\347\343\225\251e\342?>\206\331\230M\212\335?m\373\032\314\263)\357? 
\243\001\343\350\353\220?\010\301\275\231\316\201\322?\200\276\335\367\366\343\271?\322\025v\037\360@\346?8r\266<\224\367\345?V\'\242\366\225\322\325?St\373i!\004\343?v\"\266\344{|\336?\022\363\250\374\020\242\334?\366\206\372\032\342\321\354?\366Ycg\307\275\357?@k@\305(\215\316?\026N\330\222\274\274\357?\277_\260\307\tC\340?\277\256\266\2563\315\344?\210\254xw:\224\330?\3315DQ\237\347\341?Mxj[\032I\345?|\321\344\224D\340\304?\351\352\021e}\037\350?\314\237\232\177a\334\322?\310\004\241\325\237t\260?\032\027S\035@\000\357?\324\362u\276\242\334\343?\001\r\257\2638S\352?\006Q\355\360\233g\322?\2558\341\311\213\217\356?\223\312\344\205W8\347?\007\376\001\332\2257\356?\014|\3275\301\275\324?\276\325\207\016\206\311\326?r\225]$\233\240\334?\\\210\3334\270\345\321?\312(\323\272\363\341\344?e(]\3670B\351?\000\204#\304\232\240[?q\343)8\230z\351?\000\200G7?\320[?\244c\n\231\323\312\354?\010A_\026n\310\326?\245\225.\267\307^\357?P\207\320\005\314\252\275?\374\007rs\353\304\350?\220/\rdd\374\240?Xf\216^\242\264\310?`\366\252]\034\201\337?. 
\272P\272\315\344?\\$\351\333\313W\304?\000\325\203\332\213G\272?{:\257\303\377\003\345?~V\006\247\215\r\354?\256vLAF\206\344?z\303u%\263\354\334?\256|\001\\\234\255\323?\000\020\210\233^\211|?\244\314\226Py\224\321?0\265]T\341\241\257?w\301\037\246\300R\346?\210|\276\347\203\246\266?%\025\004\223\317\005\351?``\010h\'w\221?\260\301M\206b\030\325?\230\246\254\217i\240\354?\021\202\264\241\313b\353?\224\300\221\203\253\205\350?!UH1m\336\351?\260T\200\303P7\356?R^q\315f\265\320?\206[\026%\204\370\340?\000\2067F\377\036n?\344\0073G-\216\317?\260/\203q\370\315\344?^\027\224\025\250\215\325?,\375\017\317\225\332\326?\264\237\177\216zC\312?4\256W{\334J\307?\033W\270.\n1\355?|=t\326\243P\326?\n\014\250g\267\377\335?\377\315\034\tp\204\352?\344\"\025\210\304f\320?\240,0\303p\001\230?\346$\310\233c\237\324?EK\327\311\3451\346?\036\365\266(Q\200\344?\344\212\345\324\001n\322?\303\245<\203\363u\347?y\366\363\270\232\250\354?\\]\332\0045\014\305?F\213\n\321C\333\331?V\350\n\275\003\221\340?\224\227\233\025\372\231\312?\217`2\326\014B\343?^rOE>\204\327?\322\034Od5\347\354? P\203\362\0100\323?\367\325\205\036&Y\343?\272 \301\257\004k\334?\261\372\377\213\212\326\344?\263\331\205\210G\216\341?t\335\217\321\216\006\350??A\2779\343\250\346?\017D\230\310\332\271\343?\255\0046p\373o\346?\246=:\322J \321?\003\367\360F\'\220\350?\226\242\227}\213\"\324?\2150\374\251]!\350?\354I\306\370+[\312?\215/\362\nl\256\346?\306a\352\263\201\331\343?\036J.\212\247\240\336?!\3675\363( \354?8[5\341z\345\337?\3346\216|\014\340\351?\332vW\331\355\274\352?V\201\215\365\216\264\350?^\376\272\360-\002\351?M\312\226[\036\351\355?\206B\177\224~Q\326?\200\230\3670l\362\215?\260\210\205\306\223R\345?\223\222\371\226\\\006\342?\305\244b\343\266\274\347? 
~\333\273\3625\225?\317\257?\257~y\344?L\325\336\026\023?\304?8P\356a@\332\322?-h\271S\\\375\352?~(\355\220Y\342\331?[uZJ\262\227\346?f\013r%dt\337?\204W\'\\\314z\332?@\375*\354\032h\221?*\375\253\375\305y\336?\225\357`\231\300;\346?r\3338\004\014\272\345?\320\277\r\331P\036\326?\304\204\360\\\014\014\320?X\204\236\266\230\350\352?\266\337q\225\307\351\332?b \355\253f\261\353?x\367\374\361\340\037\311?\222Z\216\323\377\245\335?_J\320\346:\217\341?L\n\201\323lH\342?\371\206\247iH\002\341?\242\264\017\234\210%\324?~M\214om\324\323?H&F1\001\254\335?\350\310q\227z\010\274?\202=Uu\230\201\347?\370\325 |a{\306?\277\262\236\257\212c\345?m\006\230\304~\222\342?:&\031B\242\304\322?\301\330\365V\336S\341?\256(T+\243\211\320?\000D\030p\214\310\306?\332\273mF\321x\336?\240\376\342\200\004o\227?\370\317\210\304\240!\270?\230\302e\211e;\310?\243\232\231\2126\221\340?\000\'\3010wY\241?8y(\370\226\357\337?p 2\360\033\352\303?;\274Z/*\311\357?\323\006\201\2756\317\344?\003wB0\256\342\346?\240\247\2766\256\227\340?Fe<*\315\016\335?`\257\224\2104|\267?\275P\307\310\031\376\344?p\264\335q\311\031\273?~8\241\275=q\322?\300\303\340\213\034\346\330?p\016\237\'7\234\277?\340D\005\200\222\363\310?LB\307\370t\337\311?\200\014\304\230\307\261w?\253^\301\336\022\362\342?\210t\n\302E\007\350?\344Ho#\223\376\317?\274C\375q\031\355\334?@\227\337Ah\025\265?\030\320%\034\027\032\264?\301x\0356\223_\357?o\337Bw\230\222\351?\026!\320\300\371\267\355?\3202\226>\234\375\317?\330\000\261\246\364\300\303?[Q\"\312\376\002\346?(\300\002\340\315\002\337?m\223\343p\244n\355?\316\214\371\252\351\177\345?\334*\356\264\317\262\312?`.c\025\231E\274?\2107\310\021M\035\272?>.\2248`\323\351?\203\264\224V\361\200\357?|v\367h\020\252\354?\376\024\026\302^@\342?p{\2511\"\010\265?\350\210\023\316\215\"\320?\252\312\230\264\360\225\351?X,R{\262\367\354?\010/\214\365^/\305?tOhg\254\207\341?6\300\240\251\014\227\331??q\341k\002:\356?~\370\2445P7\327?b\014\034\347-\256\354?\007/\005\242\333\365\343?\263\271G\364P\227\351?m
v,\267\370=\315\323?V\263\237\206\251\212\331?\242\017\371\365lg\350?\370\024\255$\007\241\261?\016\031\314O\2055\336?0\362\010\277Rq\254?\036\342\262\'\266\343\347?Py/\2448\357\345?d\370]\314\303\037\306?\234\220U)\245\220\336?\240\300\337S_4\220?\210\272.\250\241{\303?\360\035s+\376\027\356?Fq\274<\315\313\343?\267\330\013\353~\255\346?\027\366\020\235\215\030\344?\n\372\252\334x\326\322?\013\244\035\352\r\022\355?x\273\034b\325\257\335?!\371\r\254.\240\343?M\226@\243(L\346?L\177q\245\220\026\327?\313t9i\201\006\352?8\334.\020|D\300?\240\263\tu\327\361\334?\304\212\206\027\275\010\311?\340z\312VVb\220?)\262Y-\017\273\345?\347\021\014;\320\212\354?,\327\311\263\324Z\325?\242\257\204$\240\206\352?\034~\000b\365\220\310?$\370\251\n\347?\343\033#{\"|\344?\244I\006B\374\201\342?\336\257\3149\025\273\353?\227\031\361\215!\026\356?\331B\021\246\n\260\351?\340j|\244\376K\274?Pk7H\244^\335?6\372Q\035\320\332\324?\"\035\021\260n\321\322?\300H$>\213,\205?\261\204\355XE\315\347?\273\023\273j\203\326\344?\243w\342\032\232\252\340?\267\353\232\255_\350\351?h\002\001\223\323K\342?\240M\265\335C\266\350?\016\327?T\367\221\350?\036\306\311E\347\377\346?\342\332\215\262\245y\332?td\235\320\222\307\310? 
p\323$\276o\224?\204\025\212s\307\246\331?\214\247M\377`\277\343?D\327k\234\202n\345?F`\344n\362\027\351?\016\002\254\"\322\316\345?\374\007]\005\326\263\306?`K\277\350\253\363\270?\337\036\261\354m\037\355?\210\237=^({\357?Z#/\315\326\004\343?\252\366\326b\010\375\335?>H^lh\332\323?\210\274\323?L\262\343?\300\271U\002 \004\304?Xh3\200Q\331\345?*\010I\366(\352\326?[\334\252\3418\025\342?\240F:\356A\261\227?hb\240\370\315\232\312?2\306:\026\303\017\343?\"\'\241\316OO\341?<\263j\246\201\r\357?\274XE\025\007\233\324?\270i\225P3\372\300?)a\301.]\r\344?&5.\311n\311\342?\252]\020\336~Q\343?s\314=\221\271\\\353?D\373\234\325h\035\340?\3360\300\347\005l\345?\2724\315\237\364\177\343?E\215}\365x\007\341?d\214#\263\2220\302?\314kj\352\311\234\350?,\310\227\214}\003\355?`\237\236\032\320\345\235?\000\337\t\226W\256p?kN\314b\317\336\342?\302Y\235\033\307\206\330?\352\354\310L\347\227\343?\346\243\031\265Bo\351?,`V~\013s\334?F.+N\226\330\346?0\3213\030\305!\277?\200\022\363\030\031\367\252?\321\321\330x\2533\355?\320B\323\302g\363\247?\350\215!\274\013a\306?|\nk\266C\t\352?\361Y:\217\360\360\351? 
\231#\26290\266?\320/\007\270\366F\323?\260^e\357\240\376\301?>\006\250\363\342\014\347?&\271\266l\200g\350?\374Z\027\274K$\345?=\363\277\235\024\372\342?\352[9Y@\201\344?\242\351\324\237$\311\343?\003\364\230&R\375\356?`\326\rlk\271\330?Q\026\367\307\204O\343?\240@f\030T@\275?(\217Se\022\202\353?\374\223\037\364\257?\332?\230u/0O\302\327?m\325\274\003$\214\355?\244\266p\315Cm\323?\252\370\260\000\337\356\340?\370\004\261\366\362=\341?v\n_Il\353\346?\002\206\032X,\343\325?\361?\306 ,\216\343?X\242\345\236y\304\357?\300hhd\273\345\307?\332U\202n*\033\325?\242\363\326\204o;\342?4\222J\255\324F\320?S\336\254\221\356j\357?n.#\003\340,\356?\005\216,L3\030\357?x}p\n\363\n\340?\030j\3117) \355?\220\224V\316\245\n\241?\350\302\t\277\216\340\356?\266\356\336+\253y\346?q\276?\306V\322\353?\t\347~\352\201{\340?\016?\223^\346O\333?88m\204\336\371\262?\316W\3166\\\026\340?\014\001\177\355G.\302?*D\334&*\037\345?\364\200\226\272e]\307?\374f7\3270_\316?\"\356\226\204\315\225\350?\222\265\202\217\3709\351?\230\250\2079O6\323?\336\221\364\026\302\023\340?\013\2525CA\266\353?\236t/\261\211\303\326?V&\350kCI\354?x\344\246\023\355\311\352?\227\267\t*\020P\347?\2246\355\024\265\316\350?\006\026\t\235\302f\354?2`\231\366\203B\342?\220\240\373~\261\343\354?\364\305FT\021\243\324?_9\241U{\267\343?AQ\177k-\204\352?:\273\270!/\026\345?\010\024\343\254ro\356?\340\223\375\000\275\364\222?\2549iF\007s\305?,N\363\356\034\027\342?T%\017\212\207\232\354?\272\177UtKi\322?\264\266c\223\\\373\305?v\365\324=\261\271\343?\351\017\252\316&\222\344?\254\251\r\237\323\377\343?m\357a }9\356?~\311y\202\373_\345?\246S^Z\240\347\347? 
\003\2031M\264\271?\350\230\037\3513\032\270?\250\004eq\r\230\307?\237\305S$\314M\341?\206+\360z/\216\337?\260\360\300-\206G\344?B\305\201C\362\316\341?\220$d\202v\000\265?.\366=;\263~\337?\007\022\203\005\372\351\353?H\216l\360LF\262?$\271\t?\371V\310?v\"\030\305\226\204\325?\300\225\272H\224\274\226?<\261\322\177\231\023\336?\227z5\237)r\340?0\030\215x\263q\343?\000\25532\362\230e?=J\305\320\236\302\352?v\206\365\331\"F\321?\355\334\n\006\373\027\346?\233\301\354\305\202\017\341?\244\333\213\203\226\032\317?\\\251{x\345\373\310?N\034B&\017\275\340?\337ym\333\261Q\346?\026\032\266C\0008\334?9\203Bx|\315\347?@P}q\356\347\210?\004\350^\307\362z\314?\250\275\330d5@\346?\t>\206\342\324\346\340?D|\024\373\315p\336?h_\rK\340o\340?\302d>F\213\226\353?,\026\007E\345.\320?\352?\271\245\234\261\337?\016\335\222\2436\320\353?\265S&I\353\325\351?\n\256\202\377\253z\350?\252\302\n0 \200\324?\003\204si\235\004\353?\214s\266\250\237\221\317?\344\261\177\373\347B\331?v\202\"\003i\001\323?\224\340\250\372\237\205\315?\r\014|\037\367\203\344?\032\207\365.\010\031\324?\251\034\207_\315e\352?\256\346q\345\1777\326?\305Y\351R\320e\341?\324\353\357\220`\235\331?\216$!\237\270\273\352?\035\327N\023\343P\346?D\333\025}\317\357\341?\361\377\035l{q\354?\264\017K\206\244\007\303?\3462,\305\304\325\333?\325t\254\020\250U\357?[\010\\\312\022\010\354?w\327-\277\243^\354?`\242[\004F\305\334?\340\304\000(\370R\276?N\006\375$\030\277\324?%\026\271(Q[\344?\034x\273?\204g\311?\021\032\235\355\320\222\355?HjT\273\333\326\273?@\243\306\236\334Y\346?$\037O\204\376\252\307?~:\340\027\241\243\352?\000T\"\337\324\341O?\000\035p\361\230\351i?\020\227\246G@\304\337?(\377n\236\361\226\344?\321\224:\261L\347\357?H\314\335\2226a\271?\264\032~\236G\236\310?\334\233\3003(\326\345?\273\251\203\272\022 
\357?\240K5\370\\\355\307?l\307\020q\211\223\315?\345\261%\275N\255\340?\037BB\214,\322\345?\026\3425\346\310\224\352?\374\215f\t<\251\305?_\035\2043\364\273\355?\005\363Y\210=~\351?$\t\327\311$\256\304?\250\255,\220\205q\274?U}+$\334\022\350?=\215#\231\t\201\351?\0346\254\251\345\232\333?\3409ip\300\325\251?X\244\236\250\235H\323?$c|\344\022\004\356?\317\360\245W\201\257\345?\254b\250\2322\267\317?\r\362\014\026:\332\357?P\350)\366\204@\331?\004\026\250\007/\210\331?3\245\273h\337a\342?<`\273&HW\357?\316R\266\264\0034\350?\345h\'\014\200\025\355?\362\252wpf\335\320?\001\363z\205\003\022\353?\240\004\315\025p\003\267?`\320Y\227D/\250?\264R\332\2478\r\327?\320&\343\010w\361\313?\000\037\312\351\261\342\317?\010\341\344\244\357\177\317?\222\2618>\225\010\351?\215\035\222\r+\307\340?Ab\27507\200\350?l\371\246\347\242J\315?H\253\013\347T\366\337?e\267\n\215D(\346?P_\033\022\312\230\355?\350\006\364w\344\203\333?x\214\375\263\266\374\351?\242~\023\205\322\317\335?\3061\240b(\210\350?\372K%\327\275\234\327?<\034\367$\234\317\316?n\232$H\353P\352?S\344\025d]t\340?\2009\256/\376\361\310?{&\365Guz\340?\034\2376E\260o\340?C\3012\314e\332\356?k#%c\330\276\341?k\271`^_3\357?\256\255\260\\\241\242\347?*\226\311\232P\336\332?\\\352\274A\342\360\346?\004\2642\177\362;\357?\200\362\334\377[\307\223?T\301D\275c\004\317?\024m\254\323m\022\347?\304\264\231\222Tp\315?\260W\245\210\373\316\343?*\030\371\177\031U\323?\330\235p\307a\265\344?\245RvA\355*\350?\203\313G\377\225\251\341?\242\204\30494k\351?\240{\3076\330D\337?\250\274\226r\255\035\351?\0002\007lA\372\211?\350z\341\270\016m\270?\001\\L\010Q\376\353?\342\337\334`?y\322?`\250+J\243\222\342?\024\326\265\263y\336\330?9]\027\014\026(\342?0+?\333\347\344\243?2\'\357C\363\177\354?H\374\2657\202y\335? 
\037\373\236\322w\316?\250\025H\377}\036\336?P\240\332\020\035\210\244?\226\270\256yn\300\356?\373\007\353\222\312\223\342?\256g\017li\301\332?\202\254Kh\034\274\335?fox\037p^\345?\310f\220f\216\201\351?\350\315R\365\363x\332?T\265\"6\220\265\300?\241\303\2043\227Y\351?8^2\264\317P\262?\220\317\353\210U\327\333?\201\021\323\300 T\343?K\332\353\301\304\256\355?P%,I\270\244\306?\223\371\226\206[}\344?\274VkP9\213\314?m_\371k~\253\343?T\255\005\3506\022\327?\023i\215B\361\021\341?\223g\207\305\340\216\347?\304\252f\323\266-\346?\034G\236&N\305\321?\2039\374Q\211\372\355?\354]\344\240h&\350?\306\360=\371!m\323?m2 \222i\211\351?\272\375\326\240R\224\322?\360\2404y\027\210\314?\240\002.V\267W\264?\230\347\261\347\233\327\340?\364\031\316P\303\247\313?\250\347A\373U=\304?hK\360\202Y\247\345?\274\\\3175\n8\331?\273\236\210\327W\316\340?\224d\337\006P!\335?lMI\223\375\242\344?a7\300\233\203\233\356?\270\226\372~\327\304\301?\004\251b\253\036\252\324?\336r\307\206\205J\321?@\265\016R~\221\323?\001\332\247=\217M\341?z\274\273\347\253\325\343?J7\276)F\261\323?\020\335\377:\313\236\250?\340\360O\264\235\343\246?q\342\374\335Y2\347?r\227\226\r;\013\323?|\333\302\272\253\026\343?h\272\342\263g\002\353?\206b\305y\003\016\344?\376\367\376\234{&\355?\020\021B\272\240R\303?\360&\236\367\370\322\312?\200]\350\033Z\305\313?\260{_\320\016\352\275?3\312\353l\345\035\341?\\$\374[\237\244\303?d(r\272\227\333\357?\370\257\245\253\224\354\343?\031\263EGV\013\343?\365\370}\335.{\357?_\203d\302t\033\353?\016\020\255\350\213\033\330?\200~[f\025?s?\"\354\300\247Lp\324?\000!{\tQ?\325?p\301O\204\227\366\253?24\256\026h\213\350?l$?s\313\325\330?1\273 
\221\323\375\340?x\316\367z\346\005\301?\0102\nP\257`\332?\002\312\233\361q\254\345?\343\3575R\354\354\342?Sqc\325\307k\344?\260\332>\267\205\203\325?C\n7\325BK\353?\270R)_\021-\267?\214\010\362\214\002]\330?~\253\013\347\244>\357?x\023\030\036\"\321\277?\316E\317\020y\371\350?js/\226\242B\341?<\341l\\G\311\324?y\266\270Tv\200\341?4$\013}\221X\302?\2368)~\226\340\330?h\310\375\225sW\345?x}\213\252dd\263?\000e\204\317K<\316?ye\355X\005%\350?1\032\214\211x|\356?\002c\203\n]\232\323?+\277=\251\233\200\346?\363\260\017\3212L\344?d\275\330\033\312=\354?T\243\232,\331\346\343?\017\177?&P\025\346?\2030RN\202L\353?Kf\254\366y\322\352?\264(\322\245e\345\346?+\333t/K\256\354?7X\352L\266h\352?@\023\3013\226H\207?\354s\327\005\274M\314?*\322\001o\243\231\330?Z\375\377}\3703\322? }:UPz\253?\301\217U$\343\321\353?3\027\332\365\364\177\346?s;\034\347\006\232\345?W\002\270\024\202\335\346?D\2007u\362\010\351?\312\004W].X\332?l\226\207\344\375\271\320?l#\024\024?\'\331?\364tv\353\316\034\356?\000\237\365\273\'\340\351?&\230r\365\206\245\343?HBw\215\032\345\335? 
Qd\331e\216\250?h7\261\030\363\371\307?d\364\257\207i\320\323?\210(\310B\361/\330?\001}\004\235E\r\341?\200M\273\361\327\270\202?tv\372>!:\354?\244\351\244\357\032\224\304?\273\207lhv\344\342?@ g\333vW\316?\310sO}\263}\327?P\334\020\2414\010\345?\253lc\001B\024\356?4<\256[\207\326\307?\370=C\240\374\225\313?]A\302}w+\352?}\266\037\326\324\303\347?J\214\r\313\360\342\356?\202\314d8J\343\345?\2204\317Y\315\344\316?\243`\034\304\313\252\352?\031u,O\351\272\343?\310u\004Ki0\264?\014\304\'\210\261\266\325?\235t^\263\257\023\353?@\315\236\005\305\035\233?\245p\000\332!!\351?\353\325\201x\n\'\355?\260*\240\230zE\275?\003`\262Y\253\336\344?\220\276\200\330\216\316\323?e\031\254\216\335\251\356?\210\005\343R\347\r\325?jP\216\022\371\242\352?\246\303\244\010\177;\327?\220\022\n\272\323\264\241?\230\367\2538\177\321\304?\337$\001\013\217\005\357?\200c\237\352\213\302\305?\036\273Iz\251j\335?H\'\314\322\020B\266?J\3446*\252\007\336?i9\3756>:\353?p;O\t\177\314\266?\013m\372\246\022\032\354?\nA\246\216{V\356?m\205\300Y\316\221\353?\003/\341\214\330\035\357?p\261\373\t\037\243\273?\372\353\250\034\216Q\354?\036\324\243\277t]\322?.\005\336\363<\313\333?@\274\305\232\177\213\272?,QZ\374\032\353\347?@\343\355*\031A\323?@F\254W/b\303?\271\342^F\232\307\344?:\322V\256\"\023\336?Xk\013\254\'\264\300?J\310\365\245X\360\343?\036\020\260\004\275\201\330?x*.v\202G\317?6`\266S\016\371\346?P\0348\356\346w\306?\325\355\240-\r\370\352?8y\217\245L\325\353?A8\301p+\346\351?;\207>c\324\363\355?\373\332Hv\021R\353?\001o\374\022\005e\351?\234J\006\275?\251\323?\333\303X\313\246\320\351?I\345\242\222K\334\347?\312\220\3761\331\363\335?H\001b@\330\301\260?\315\251\217\250\002\355\355?0\245t;k\372\322?d\354P\025\212`\345?X\206\311\250@5\306?P\361\214\034\266\026\305?\233U\\|\226\265\340?8\347^\216U,\266?6\027Q\224\220\177\352?\220\220\204\007\to\276?\300\330\270\325\257\250\234?00{\225\301\364\251?/\254\314\345\337\264\341?\262f\352\371\337\351\321?\260\324!\005~U\307?\000\3008>\242\363Y?&\264\002H\031
e\322?X\213\243\211\376[\333?\300+\322\356\336\246\233?\006\323\277\0300\247\357?\034\2679B)\035\344?I\246\260\3422\261\342?\233cu\315*\350\340?n\307\332&{#\322?\374\317W\310\t\253\303?\330\250/\005\236\225\271?P\324\345\205Q\243\276?\224\336\010JN\177\327?p\036+,\344\311\315?\362\227\203`T\312\340?\014\353\031\226;\031\322?\350\222\032\201h6\270?,\334#\206\365Y\314?\270z\375\202=B\330?`=\021\246\335\005\262?\206=\311\014\334u\346?|\2144\020\n\354\305?\200>e\236&\013\212?r\222\263K\346\210\333?\264\203\356O\356\211\351?`\302\030\266\231\204\351?\000*\n)\212fU?\010\233\351\226ZV\267?\270\013b\356M\021\326?vr\320m,\254\323?FV\224;p\316\327?@\245\261\365\375\242\257?\2062\324Q\350*\327?\034\255\317K\016\005\354?\332\377SX\224\354\353?\260\300G\276bM\251?\204C\t\355\017\326\314?\275\320\223\367\336\312\343?5\233i\347\233\035\354?\375\230\360y\306\201\357?\00087\225\211B\262?4\335\030\3462M\320?h\020\001iu\231\340?b\252\030\353N\375\327?\320\205\037d\254W\353?6\307\353)1\241\332?4\246G\275\366\001\317?p\001epg\332\276?u\210\337\021*\271\347?\021\366\025\016O.\352?\233\257\367[\370y\353?\214+\256\305\336W\334?faS\307\231\236\350?\000\320\037^-C\256?1\"\357\245\236\251\351?\314\223\370\372Z\004\346?\361\220\220\207\246s\343?\220Lb=\032a\302?\020\006\377T\251\020\332?]\322j\320\225S\351?\2449\017\330I\300\342?L\276\360W\2006\336?x\220 
\256\337\"\306?T\034dN!\275\346?R{\022R\023\277\341?!-\251\302\304\r\341?0\323\254\246vV\341?b\241\276g\336w\355?J\013\242\267\212W\335?G\226\307\177\273\216\355?\320\370,\357|j\332?\222\010\362^\253\326\320?\3600m?\200e\277?\244\346+\320M\346\345?\030\227\2419<]\316?\310eDD^\365\344?\214\000?0K\252\351?\013\272|\346\247s\352?\200-2\276\004\352\237?\010\274\213\244@e\344?h\357~\373Q@\355?\322A!\212>C\351?>\231\276\355\336\367\332?\210\256\235t\r\214\274?l\232\241\204\336f\314?\0001\221\230\257/\336?\210\026\207\251\240\313\351?\260\262\026\211\374s\325?\245\\\215D\247\003\357?\304\364\022e\341\313\320?\354;E\022i\332\316?\224&\370\226\034A\346?ym\224\251Y\347\340?`\365R\336\2278\343?\034\222q`{+\330?L!#\236m\n\322?0\220\216%\245\036\345?\200\037\266\342IU\240?$\226\224\274J\231\343?\331\004\375%\237\374\354?5\247S\035\020\000\343?Y\343\3312\275\262\357?\261G\004\343\242\366\357?Z=a\234\263\224\340?&\326\343fa\360\354?\244\263\021\241W\317\327?\210\000S}\313)\347?\370\024\343z\235\273\272?\345\243\246\352\263\002\353?\333\201\213\317e\213\346?\022\3311\240\017\311\326?4\236\226\356\243\376\327?Dj\233\247OW\357?\355\304\216\361[\010\341?\251\260.i\262\260\351?@\000:\304\\\315\346?l\205.\"\3444\353?\304;\030\216\016\270\341?\326\363\206B\266\014\322? 
\365w\031\322V\262?\273\322^\314\315S\340?\245T9\237\231\255\343?t\3528+\303\303\344?\374~\314o\313\236\347?\344\313Jysm\350?\221\237\000\347\032\246\353?0:F\356\363\263\302?\360jo\214\276\247\306?\226\314K\367\361\335\351?~\263\247Qf\370\345?B\322\211\335\237\303\354?L\263C\r;T\340?d0E\210\016\326\331?\016]\027I3\215\324?\252.\024\027\300\357\334?\000`\201\232\036\316s?\336\226oK&\333\327?X[\243 \262\365\354?(o\304\212c\254\304?\210\\\232\273\006\177\333?&\017\024\263f\333\327?@#K\321Ld\306?\300}lQ\364;\222?c\001i\211\235S\354?\002\222+1\r\312\337?\177\253\356P\031\264\352?\004{n\306\002\242\303?\340\204\3379\201\274\230?.U\271\215z,\343?\224gN\237\224\307\350?PI\273\220\236k\241?\000%\222z_D\307?\300\346FB\250\007\214?\364\361\347\260\346d\310?\340\263\031\272Ha\252?\360V\272\231\377\344\343?^\226-0p\202\356?\247&\224\322\005\002\356?\240\037\030\205$\260\230?[M\236\003\000\231\342?\256\301\213\306\240\231\333?\256\177z\214f\267\350?\260\034Y7-\257\352?2H\205\201\221E\354?\020^B\352\346b\273?pt\n\350\243?\260?6\333i,%\271\350?U\363\367\222\r1\347?\372j\023\tp\207\322?\200R,\373\016\361u?\031\374T\267\t\244\351?\006Q\226h\013\314\340?\264\272\342\3331c\354?:zs\006\203}\337?\034\377E\263\010\373\332?\203\355%\200\230X\357?PW\034\377\367l\342?\300\344\316\272\240\350\345?\240f(\200_4\237?\332;\242A\261\026\352?00\215s\376-\316?\031\237\317*C\352\350?\224\233\340\tjQ\357?d\305\273\nd\366\332?\204\303\221\264\032w\301?T#\004H\353\304\330?\336\333\177\263\215\341\353?\007rw\r\300u\354?\000\313\377\322\301t\226?\252\261\244\036\333\215\355?\021\234ulr\340\354?\2541*a\252\207\326?zVj\237\344T\335?\367\212\333\310+\025\342?\210+\253\250\262\236\357?C\264\233\3013\375\353?\255@\240\224\214\013\351?8\354\341\315M\020\276?\276\224\001R\032/\340?\334\370\325\363\036\036\321?H\270\360\247\232\004\262?\252\351\243\023F\376\325?wH\001\355\274\025\357?vQ\330\303Q\225\333?\020\302\367\333_\210\271?,\354.\335LQ\335?\302\241\312B\333\227\331?\323\244jQ\356L\345?\206\003\210W\332\355\
346?(\216k\354F\360\331?(h\353\241\266R\356?uo\342\327\340!\352?\240\020l\025\343b\344?\000\305v\360\311\347\276?\366\266*Z\017\"\346?1\320\304\245\237\322\351?\355\327\316\316G\026\351?n\353m\207W\005\342?\017@:\237\345\216\356?Swz\230\374]\340?\302,p\322ac\323?P\376<\230M>\265?\363\034\245\026a\246\353?\250!\037\035*\371\350?\250`h\233\227c\277?E\004\n\306\342\256\356?\300\331\322\355\030\226\330?\n`\211^ZY\333?5\330\253\267<\330\352?\357*&\352\201\351\356?\230\262P\027?\344\323?x\354\2464\014\270\263?\250\305Sn\202A\344?\330\230\350\3347\342\335?\201\260H\220\310M\342?\225\'jD\nc\353?\007\376\262\350Z*\343?b\027\214\177Vc\334?\030\r\010\306#\237\306?\207\302k\264\346z\351?&\032b\235\237\221\355?\260X\374r\344\031\353?,\211\243\345\252\315\305?T_\375\375V\377\337?l\377\367z\211a\323?\336\267\013\3141\340\321?\320M\027f\303\234\253?\375\351{\345\243\350\353?\372\006\004E\004\r\355?R\237.6\215t\327?V\224\'\207\026j\333?\2600\242\001\305r\312?\366\\G0\221\256\344?q\256\t\001\250\266\345?\212*a\346\246\376\326?\351\002\330\177M\326\344?f\242$}\016v\323?\253\322\334\311#\307\356?\010\"o`4\236\357?\3253\352+\233\027\356?p\335\337v\2460\267?\177\235\n\206\335*\350?\031\003\307\266P\307\344?L)\243\302\001\305\311?\010\003l\237\231F\267?p\320H\007\363\253\334?\362\037\377\266R\361\357?m\345\307\201CN\353?\213vn\370\233\021\347?^k*\214p\"\342? 
\007\375\217\331,\241?@\242\355\345[\215\277?^\201\327\036\251\206\336?\260k\017\376\200\254\343?H\346^)\025\376\266?x]\326S\330\017\306?\277\177\227\313\273T\354?\252=F\237\317\271\347?\330k\242\342\356\241\313?Q\371\372\2367S\347?\342\353\323<\253\006\346?\3100i\330\010\350\267?\217f\010\321;7\341?\002\302\353`?Z\351?\220h\326\262\376D\344?!\226A\r\322\337\347?h\332\024\017\340\247\261?R4\030 \020\037\351?\312v\327\227E/\357?D\376\203\303D\n\313?$\247\255\210$\211\330?\305\010\270\265\013\002\356?\265\002\002\2650\232\341?\006\340\225\362y\202\347?(\314\335\316\300p\265?\007\376\253\211w\017\341?\341&H\177!\227\343?\310\020\363Y\265\000\316?\212\032\234\377L\342\354?\235\032\215\234q\313\346?[\301\255\363\251\207\346?\262\312s\226O\204\354?\370\364\023\304\241 \315?\010\025\340\367\267\276\272?\275c\212\217\231\321\342?5\003\034K|\206\357?\227\251gb\200y\347?88\354\261#9\277?w|\364\321\233/\356?H8.\376\303\307\340?\360\365c\263\000\214\265?x\337\303\215J\241\275?G\234SIC\375\356?YH\021\010wX\351?0>\376!\2572\263?(\331\n\340\374.\302?x\302\225q~\233\276?H\004\263\202>\337\330?93+\343I\013\352?\004\325H\030*\314\335?\233\006f\241\230\000\355?Kl\307!\3601\356?~\r\342\231\033\361\331?\337\277\327\354A\262\350?\354\177=\236y\270\342?\342\231\223&2\243\340?U\201\010(\333\357\352?\230Uk\3009\334\327?\251\301\274\245P\267\346?r\274\236\366t\202\327?O8\271PK\233\352?\r\246^\312\260\321\340?\304f\3138w\230\336?\034\224\024\200\300\322\340?_\2019~\255F\346?\000\234\tT]\356\225?$p\256\253C\371\357?\266\032s\310j\203\321?\336LTo^\023\324?\254C\316\300\216\233\331?Ht\031;\211W\344?n\212\323\311\347\377\343?\"\240\033_h\242\334?\374\253b\"\346~\342?\330O\002d\274M\305?\3027\010\371\3156\352?y\352\304y|H\354?P\177\030\364\010\324\350?\371g\245.\331\267\341?\352p\364Y\261\370\334?\367\336\273\253\256\332\341?\327\352\2508\300r\346?8\036p\357\372\264\272?\264\021\270\005\032w\335?j\332\211\024\366\225\345?\243\306\227\361\021|\341?\314\303\215m 
\362\304?*\177P\355\273\010\343?.N\352*\000b\321?6F\225\250\354G\343?\251\270\300r]S\343?\370\246\324f-\277\273?\226\307\0264\307\245\351?n|\321\250`\240\337?\310\326$O${\324?\240\313;\235\256\350\343?\262\226\273[m%\355?\\t-\030\335\346\343?\366d\034\367\0376\355?\032H)\242\036\013\342?\000+\240d^\013\327?\017\360\343BQ\314\340?Vq\'E\231\263\345?P6|\345>?\270?|\242\033\222\237\366\307?7\"\027\256\260U\355?~\257j,\242\273\354?,H\236Y\232f\352?\274\357\247\356L4\345?\364\206V\311\222\366\332?\315\010\033\352N\365\351?\325RG\217\274\250\354?\203\362mU\311\323\341?\376\367\261\336`\021\350?J\020\302\343\325l\356?\021\177S\231tB\340?\010\370\t_\306\t\326?b(-_\236\001\326?}\246\030\032;}\346?@eD\177\365\224\326?\204\233\234j\235\024\353?\262w\037\235\360\004\320?\000C\236P\010\016\331?\264\343\344\006$\216\324?\240\373\031\3645^\234?u\246E\272St\340?\354\221#\000\210n\354?\274\303\013H\036\240\324?\240\355\303\264\231N\305?~}\261\366i\255\325?dN\220\277\343\235\315?\300\023\211\244\266\242\346?\036\317\354\360\\\225\357?\361\320Si&$\356?\235\324\366L\204\335\342?\024LR\325XK\353?\224n\005\211\335\360\333?gl\301W]=\357?\\\257~$\253c\353?\377\317\367&\317\354\350?\255\327\240\262\260\357\346?\240\312\024\376\371g\251?L4\330\365^\003\321?\320\200\2268\022S\244?\250\356v\221)\210\303?`^\3029\370]\316?K\2314sd\223\351?\013\036\336\347\313\343\351?J\234\233\273\223\332\322?\364\222a\034\224Y\325?\216\023\342\313\0260\340?\020\336\213\231j\'\313?\375\201\301\305 
L\356?\036\"A\024\002\360\324?D\032\255\231\350\372\302?\010\034\203\014\302\224\344?\000:\317R5\342\324?\332\232\266\303\314f\356?H\367\307/\030\203\307?=\241=\313\373\256\340?\270\274\304W?.\323?\200\302\2243\\\364\356?\267F\016^\274-\345?\331\224T\367@#\354?\210\032\247\366\341\005\302?\220@V\320\332<\246?J\322\n\'R\333\347?\234\247\211t\244@\341?\024\307\262\264y\341\342?\262\234\253\001\232\232\357?\rw;A\022P\343?\233-\000\330\0223\350?\207C\206\311\000\345\351?\311\014\021\373\020\320\353?~\263\270\007\311(\337?m.\257\224\037\027\341?\343oN1rU\356?Z?\254\223\337N\345?J\304\301\300K\004\333?|\336\326\335\331q\336?p\001\003\252!B\254?\010\031:1\315M\277?\342\254T$\235\272\322?`\204\250\177mk\222?g\232}l\201J\346?\252 s\305\356\002\355?k\233\227W\220\223\345?\244\335\235\214\370\370\356?!=\177\026\013\023\352?\304\017\271A\351H\303?\347r\335\242Y\372\357?\317\006!;\3531\350?L\033\002 c\252\345?\336\nb\255\275\225\320?#z\032/\340\276\356?t<7T=\254\353?)\2236\037Y\006\356?\213\305)\200\313y\340?\260\006\227,\376\006\344? 
\t\370\327\217\277\273?\240\225\217\265\230=\264?\300Q>\261\n\031\275?\243\245\213I\022\230\342?\205q\255i\014\276\356?\340\265\323\344 \254\350?\004\367\263\210x\035\312?0\260\020\n\334\343\323?\326\034\225\323\302\023\342?\354\343G\362\264\327\323?<[\267 9\370\300?\014\236\217\355\260\207\357?\376 \333\220d\243\347?\342[\t\264\376\241\340?I\ruY\036G\355?x\027\r\\\357\367\351?\376\274\366\037\320-\341?\244\002\037\341=\000\352?\250\254\275\\N~\335?\240H\030w&\311\330?\000\207\276 \366v\316?\320\3656\2638t\246?t?\025\301L\312\343?\200V\'\360\350Pq?\263,3\375\202\367\355?\004\203\211\233\232~\347?5\030\364V\364A\341?@>\037.k\305\341?\350\236\033\317\341\322\353?\232\316\260\305i2\352?\354\243\337\2421\204\357?\332\264\013\271\350\031\321?\202\307\215\025\270\230\350?\322?\377\350,3\320?$6*\355K\300\315?\250*(#\234*\276?\240\254\307\202\361d\355?\246F\032\335\301\035\327?\010B\305\0107u\342?dL\257t\030\277\331?0 \333\337\002L\357?;\026}\237\222>\345?\363\256dq\362u\343?B\226\216\235\260\017\351?\000D\016\024\202\004\337?\020\266\262\333\340\200\252?4\267\214v\215\305\305?\260\337\303\032\343\336\243? \261\013\016E\332\357?P(^\314z8\252?A\022\250\352\3648\357?\354v\263R}9\354?\010\007X4\207@\341?\3529\026 E?\327?`\311N\310\035\021\257?\355\3509[\273p\352?\347\372\231x\235\226\353?\362\234N@\304L\343?\252\354\250\271\364\200\330?F\273\\\020\251\210\324?\200\347\262\224\353*\260?\347\t\354K\351m\345?\3606\000q\370\235\315?nf\246\227=}\352?\220\trv\324\000\311?%\243\0317i\262\355?\330\007;\377R\353\335?4\217\367\226cN\334?t\001\035\334\260I\347?\\\215\302\200\262\262\337?K4\343\311r2\353?\032\n/|\366i\335?\r;x\355\023\366\353?H\341\225:\211M\301?\271=L\205\304z\344?\250\n\036nK\365\267?\200\003k\372\306\254\263?\362\207\315i+%\344?\213+\014\253\020&\347?\212\235\251A\017\267\354? 
\362\375\274\213\347\260?f\320\276\245\216\247\345?\230\035\013\271\316e\271?\303\014\331\223;d\356?\224l(\232\240\323\335?<\210\375\372H\r\352?\210(\331\324\0014\343?Iw\3015\005B\340?\205\264\243\265<<\351?\024\356cb\020\255\340?\204\213=\025\'C\307?\251\261J\257x\020\353?X\375\362\222\024\005\333?^3F\350\355\353\356?\016{\334\032\270.\337?:\266:\266}\000\321?p+\227s\312\255\273?\333@\313\n\345\322\357?\216\260\344\014\260\267\321?V\216\200\375\233\353\345?>\035\0322\341\007\341?\310.\305\241_d\326?\262!\221\255\350\'\356?l\0262G|\363\357?\014\3141\251\010A\332?\350@=\207\366A\302?\260\363\330s\007Z\333?\267p\356\266\370a\346?\005\032\311\372U\231\344?\000\003\337p\245=\254?h\031\217!\261\235\324?\377\241\350\276\302i\351?\323Dta\320\251\350?\222\3701\312\016\252\336?^\217\357\016l\322\347?\237|o\361\335\200\357?\210!\202,\302H\355?\362\227u\025|\305\350?\320\237\324^\353.\342?\320\024/k\262\202\260?\351*W\321]\253\345?\252\313\023\204\370\315\350?+\340\305\256\221\'\354?\245\007\207.U\220\342?x\246\010!\321(\353?\036\235F-\273\301\353?@\2101\235/\304\210?\316\020pu&\000\334?4\344\244|\242F\333?\234\227\352&\037\237\314?LP\356\302\213\331\330?b\333\\\341N\313\325?$c\212kF\301\302?\024f\246y\347\330\351?\210Y\201P\252\321\267?\367\001\325\033\035\374\341?`\217\000\036\235\333\342?\254iG\246\343\367\324?\303]\223\252\256Z\347?\364\212\037\310\374\374\306?\337\201\244\200\341a\355?`,\220!\206\263\276?\\c\307cM\307\314?\t\'\267)I\343\355?,\375\014X\360\225\326?\024\326c\377\327\202\333?P\"\261*\352k\243?l\214/\314\232\036\317?4B\262Q\0056\317?\014\261\037\363@\005\330?\340\n\264qLm\320?\200\275\256\366\313\314\341?\350|U\311\326\316\354?\016\236\336FA6\335?\210@O\030Uv\266?\255:3s\230a\350?0*\007\017n\357\303?\023\303\3651\256\006\354?\310\322T\264l\337\261?\\\301=\030<\317\347?\313o\322X\200_\340?z\034\355\005\304\307\343??\337\3731\201\325\342?\264\231\037a\032\255\320?\n\002\241H\276\331\355?\236\257\366\264\244\231\323?\260\365\320m:i\267?\301\272\337\024 
f\354?\372\251hET\304\357?\260\265\000<\210\335\275?Zu\306\247\016,\323?t\016\213\232^\302\311?\212\360\343\335\250\354\332?\200\025F.B\336\314?\360\353\037\326p\230\251?jM\370\340`\025\355?\007\2431\231\345b\344?\220k\210!P\364\304?Y\002\224\226&\264\356?\027\326#>\362\320\340?\326\261\200\301\207\233\341?p\347c\201\247:\243?\000S \365\000q\336?\026\005X&\221@\325?\000 \336\337\271\221\234?\256\346\035$\367\237\350?\260\025f.>\366\344?`d\223>\216\323\330?~\220\302\231\265\'\345?D\020\241x\273\003\334?<\377\014\326\3473\335?\320\224\222C\261\031\326?\320\276\347A\210\210\337?\230\360}\"I$\357?\244\000\321{6\350\352?`w\220x\350\224\265?\346\225S\363\255\021\350?\364g\221\'\264\344\312?\377\231R\032\0005\353?\240\254\277Kw{\331?\304\375\0033R\223\320?\330#GbZ\217\346?pSC\221g<\253?\372\000\037\r\326?\346\266\026\320B\233\357?\370ZS\345v\344\274?\260Q\240m\217+\353?|u|\235\355\367\303?\212\275 \211\374s\355?$2\211d\t\035\346?\370\232E1\305z\307?|A\2208!\243\320?\000\274\307~\367\203\300?8{\005\002\270\033\334?\340\264x\257\205\274\304?\2048\275\275s[\335?\031,\242qy%\343?\240\364\257\036\312\204\274?\300\376\r\206\256\024\217?\3063\254cL\306\335?u\347\250\356R\352\347?\343\340\260\332D\345\341?`J\333aR2\235?\274\243\244\276\031,\340?\340\343}\271\311\352\260?\352\230\223\263\'\377\344?\nI\023\246O\310\357?}\211>\366\220w\342?0Y\030\332\301m\335?\324\203\257\327\220\343\315?H\362>E\035c\264?\315\241\r\"\300\036\357?\215\251\343\312\255G\356?\244\201pP\360b\337?}\346L\350J\336\352?49\010P\007\221\311?\325\021\267\364\222\253\353?\200\343\250{Io\227??\3106n\311\320\345?\364d\207\331\3314\327?\354\3045g\303\375\326?\264\353\034\242\306\010\334?\023\213z\230\251m\342?\374f3\264B\033\345?\354X\356\353\234\237\352?\200Ia\022(K\244?G\372)\240\010\367\355?(K\205o\305\200\336?\263\264\231\255\333\207\340?\236w\263A\310R\353?\200W\233\217\003\311\235\265?\230iI\221\310\203\273?\025\245v\252\023}\354?\370\371kow\375\312?\354\2775\316;\345\356?j\324\310D\256\227\344?R~\020\314\014$
\323?\334e\267\033\034\345\333?P\264\363%\231\003\270?\322\351@M<\260\320? \266\341\244\206W\330?r$*9\251g\335?\030\342\371s\247\305\356?`\024\005-.P\316?\013AT\031\345-\354?\364JG\355\217\230\343?&\222,Z\317p\347?\300\226:\316\304\271\205?2\037\033\013B\023\354?\022\367\356\377\227O\341??S\311\335Px\356?\026\002\204\242\210\375\323?:2t\347k7\343?@\303X\215QA\357?8\357\202a\255I\357?\0247\374~\322\333\334?P\256\213\031\334o\342?|>\375\nY%\310?\332\333\204\365\034\260\331?\355W\374\024\027\322\342?Y\026h:\250\326\342?\244\262\204\273\361_\321?\"\274\2359&m\321?\310\300;?\0272\325?\020\000uL\365\270\303?\216\252;\n\322w\331?\315\257\372\343\305\302\354?\274\324\322\222\007X\350?lq\2728\3240\351?nd$\025\345\374\320?\271F\212b|Z\347?\324$r&\232\303\344?\216\346~r\354\312\342?\370\263\331\255\265?\200?V\"\025<\271?\\\317\037\205\034\226\343?\024\216\024p\310{\314?d:\202O\333\253\341?\006\2021\235M5\323?\356lt\353h}\325?q\262--\001\354\353?\300Z\341\320R \330?\263x?rc\035\347?\300o\202\214N_\266?\232\236\377\260\233\317\330?\207\276\320\216\275\210\343?m\322\232\225\362\272\347?z\244\377\234b\037\352?\332\203\305_}\237\351?\316\037\277H\270R\332?\300\324\nz\272n\242?~8\013\334wn\352?4\021v,\330\336\346?\312(j>SK\350?A\260F\377\216\325\350?\007J=kdG\345?[\320\206\372\230\270\344?P\013\311m\277J\270?\350A&\n\032{\354?\320\334\365\213\216^\313?]-\230\177/,\352?(\303\362@\211\\\320?\257\261\376\371\331\230\350?`\236\216\230c%\303?d\344k\210\331\017\323?(\372UX\304\005\277?\222)z\345E\231\322?\025\330z\006\016\201\354?^`.\252\224\327\337?8\270\232\025\026\306\270?6W\354\213{\347\347?\246\333\226\306\303\347\333?\256\367\214\323>j\322?\204\352\326\252Y\335\353?d\321\253\205\235e\335?\336\212\251\352V1\334?}E\306\255\223\365\342?\344\260z\t|\357\340?R\326\323\241R\'\340?\250\217\321\331\217\210\272?\2741\024\2042\207\357?\306\305HV\312\006\337?\220\214\326\005\323O\263?\370\225:\374\302\203\355?\273\3659\320\207E\345?\014j\354?>\271[y\206M\322?H\016\3242\230{\313?P\214>q\002\350
\321?\217\250\305\226N\247\357?\200\233\313\214)>\211?x\254\252\211!(\345?\225\\\033\346\215x\345?\306\n\"\362~\036\322?\027\302h:\000\216\354?T\031\246\315W\337\304?\340\\\367\351B<\225?\333\027)x\025-\346?\010\036{\2702h\342?\202\235?\017\272\256\340?\207\237\365\241\362\303\357?\006\212\034\315-\376\326?\210\202\327\257\341\254\332?x\225\265\013\024\276\350?)K\344\235ch\351?\352\2642\001\236\333\356?\241\226\013\262\\\036\357?.\033d\020\204\360\323?\272=\237$j\222\332?\314\tcC\214Y\345?\310\315]6\245\325\345?\364#\007\t\236\212\337?\360\311<\230C\374\351?@\377\340kG\013\334?*\201p\321\202+\331?~\r\022\213e\307\351?\352\244\270\324\301\275\347?h|\227\336\253\"\351?\3135\010\364\204\354\342?g\273\010\'Sr\357?\230\322\240\221\020\236\356?\311R\227\263\376\366\354?\311\232\351\013\333\277\344?\240\010H\313D\303\274?<\305T\323\014\312\331?\234hkQP\264\322?\234\366\342\354\277c\322?\304\032@\372#\342\351?m\003/\312\322z\357?\240\026\336)\343\360\310?\230\200d\342\224\321\346?1\346\t\271\322P\343?\246\264\350\230\250\032\327?(\300\253\311\2715\305?\322\346\025\004b\272\323?\306\3602\023\232;\353?\240R\023\351\'y\344?,9\214U\307\244\303?\303\204\351;\265\223\343?\350Iz\026.\247\325?\270\016E\010-%\303?9x\203e\206Z\343?`\312\234\313\037\375\330?\0345k\374\217\330\336?\322\342_R($\342?\257}\026h\306\224\352?>\334N7\356z\321?&\306Q\003:\356\337?XIGkl\255\315?\302\241\210\351!\257\321? \247\242N=}\231?\350\371\264z\335\215\344?*^\321\214ol\355?H\236\320\006\354\376\350?1S\tg\035\361\350?\177:\253r\343\320\354?\200\2444h\347.\347?\3466\214a\351\352\337? @h4Z\257\272?H\3322\213\367U\324?\247\250\330aY\222\347?\304|\275}\025\241\312?\360\035\322\351\027\205\327?e\003\342\215W\251\354?\372\200\026\317\263[\330?\372&:\302\206\221\344?\317D\307u\001\241\343? l\322\317x\366\324?\360>6\215\272\033\242?\306\375\000\200j\274\353?\006)T\200\346\241\332?\352\265}\272\225\301\353? 
eX\231\373=\257?\014\302\253\331\236\221\355?\220v\343\351\360\341\337?\263v\313M\311Q\356?\244\225U\271\"\335\353?\200\035H\231.n\326?)\270\352\024\300x\346?\353\245\032\342\245}\350? \270\362H\362&\346?k|H\334\362\231\357?\220\261\273c\361 \307?I\362\241N\316\206\353?\240\302\272\300\340\216\246?SU\337\331\010\371\356?1G\315\312!\271\341?\253P2\221\270\333\340?_WG\023\205h\342?\216\317\030\027\301\347\326?\257\020,\242I\242\342?\362\367#;\332)\327?^ |\241\027\376\333?\320H\226\177=h\335?V\022\303\371\020\341\337?\320|>\220\307\324\320?\017*\231=2a\341?\036\312\263\250\255\377\347?\324;\3614\321\272\354?K=]\036\251~\345?$}\020=\237\270\322?\005U\225\326\205\317\351?\200\351=\033\200E\341?\006\302\335\334\361\315\357?[\034\315=,\245\353?\014\']4\324\275\316?\237\225\275|]r\356?L`\265\204\005\314\307?@\216\032\005qK\202?&\270z\0328\213\354?.v\\\331\234<\347?\324\003<\354\023_\315?x[]\313q\307\265?\374\274\242l1\377\324?CS\014\242c\264\347?\275\020\001\335g\222\340?4]\234\277?\210\351?0SU\333z\263\313?\263\'\323)gf\346?dJ5\n\207\334\333?FSN\373\\\005\333?\343(`\216Zu\345?\220V\007\372\376_\273?<\237\223\331\337\005\314?\000\000[\214\3327\334?#\344\361\300\367\033\353?T\304)8~\\\350?\004\272\374\013=E\345?\032n\202\364\036\033\350?(\277\177h\2475\271?\250\257\345\311\210\332\310?\242`\r\035<\004\324?\246\245\212\0307\025\344?\200\332|\027\337\035\261?\223\355j\034\3530\343?0\261\244\376\262\345\310?/\376,\235\350\236\353?\200\352\t/N\255\245?w\244\204\253\311{\356?\030\206\370D\025\301\261?L\031\034\276qN\305?*\034$\362\255\251\343?7,\242\241b\372\352?S\236]\335\336\"\357?\312z\243\202N\217\324?4\271\316XDM\353?\240_p\\zK\250?\345\\\235\240\353W\353?\364m\340\373\002W\351?D\245FC\026\227\350?\377\330tE)\005\340?\037\2046Cn.\344?t@&\372.\362\314?!\221\236|\207\324\341?\013j\003x\035\024\342?\000\3218\355\337S\317?\350PX#\321\252\300?\344J\207[\325\323\302?H\360\216\203\005\257\311?\262\351@\333\3220\345?I\342\207\325\014\247\343?^\246\272\242\t\027\323?\0107\202\010\254
\004\313?\0000u\317c\000\222?\330\373\361\222\263\267\260?\004\313\366\212SV\352?^B(R\326\007\353?\006\334!\025\270\215\334?\272\344qt[\'\356?\242bU\300uU\356?\272\355\360\337\344\237\344?\017\307\226\032\247\356\341?@\240\224x1\352\232?]6\305vO\003\341?4\215\255\272\214\007\342?\350x&\274}b\315?(fI\312\255\273\334?\352\014\252:\275\300\352?\371\322Tm\253\366\345?\220\215:\tF\370\356?\360\005\n\234\273\334\252?@q%^k\022\334?k{e\206z\335\343?\320\013M\020}\244\342?0A\0022\200H\334?\235\202\023\024\362\354\344?\346\247B\345l\217\341?\020\240\233\325\000 \323?b#\216S\022\264\331?\335\013\\B\257\267\352?\302\022\344\332\307\212\322?D\340M\033\327\'\341?K;\261\234(V\346?\010e\232\235\244=\263?\232\345\241sY\350\333?\332`Wu\2005\340?\000C\367\256Bz\321?\230`X\206\036\233\310?\014\313taS\001\315?\005\37252\315)\350?\344^}o\027I\333?~h\357U\271\310\341?\210\370\267\370\302i\335?\244}\215D\006\316\317?rq|u\244B\347?\010\264\316\023T[\330?\000@\013\2250\306\030?C\027\375\222#\343\351?\212g\355c\013&\324?\314>h&\275Y\310?>o\334\247\030\255\332?\017pO.\215^\343?\366\030n\023:.\357?`\035\177\330O\354\222?\334F\212\277R\367\327?\337\\\244\233.,\347?\254\3123\274Vw\305?`\370\353\351\373\252\346?^n\031i\274\312\356?Pa\"fXs\317?k\033\370\t\213\335\340?\370(\236\235\007\374\262?z\217\2531\310\261\326?X\002\277\\\013\013\277?\220G\0103M\316\252?\371.\231\327 
z\350?}\020\340\"\366\360\355?\314AN\205\330\006\303?\025\001`]\337\225\354?o\221\310\355B\250\347?\320\336\371\247D\222\254?eiL\274Eo\355?h\277>\370\271v\306?\254\022@\311\321\365\340?p\002OEU\210\302?D\327\323\226\255\367\304?\304%t\265\263>\317?\304h5/\026\277\307?D\305\003\313\033\335\306?\344~4x\251e\350?\304\303\202bi+\317?\325\212\254\214x\350\355?*#\242\271\027C\347?8\222_\031h\312\312?\226\335\3629\217\026\345?D\270Nq\016\252\334?U\272RR\315\005\350?\214-\342Y\341\267\357?\\pY\2568\'\312?\036\344l\032\332\322\324?\005X\31052\007\350?\241\207u\225\224\023\344?\036\330,G}M\357?t\007\211\251n\372\303?\310AoJz>\324?\023\213\327\214\014%\342?\343\362\016\035E\260\347?\264yk\322\004\305\315?x\315\002\301\230\027\301?\007\223b\330\302z\344?\240\362G\226\372]\227?\207\000\262\250\033\327\345?\031\036\202\273\241t\353?@Hn\245\244\001\321?%\320\210\222C\267\351?\301\020\330\034\260\337\340?p\017V\212\347j\246?\277\344\371\245\267\362\341?0\360Yp0]\300?df\275\241T~\301?\352\314\364\223l8\337?v\370\302\274\275\217\327?\254\345\223\354,\253\314?\304\033J.\324\210\322?H\361\340\276T\260\350?\036\"\3009Tf\341?b\334\261\311\215}\354?\030\030\361\371\355+\265?\342\310\237\305\260\335\344?\260\013\315PM\005\351?\374~d\311\330\250\335?-\222\035\312\0335\343?\035\000\303;\224\016\352?N,\342\273\245|\332?\350\310\264AK\217\326?]\317k8]\357\355?\206\216\211\216\rE\333?\371\373\227\372\212n\342?\t\203V\220nW\351?\222Gim\305\344\356?p\333N\225\027!\243?l\037\023\356\335\216\344?\253\243\263H\224\226\345?\211\374\271\2311M\347?n\271\376\235,_\356?\340\237\231\007\272\202\265?2\366\"0\232E\332?2\035\225\207\341\342\353?}\322\210\371\273\347\355?\254\305\2770V*\354?\300\342\201\254\360\323\342?\341k31\264}\356?/\023e\313\360$\347?\347\256m\225\316\326\346?t\275=&\003\315\305?\255\206\371\033\200\224\346?!\201+\221_o\340?\255\352\023\356\3206\354?\202^>$\220\353\325?-\366\305\016\372{\352?\350\022\003\031\372\004\335?o}n\014Ag\352??\2757\207S\234\341?\256\002\n\271\t=\320?`=q\263d*\26
7?\274\330\374\320\3440\306?\370\037i\204&\222\320?\264\314\367\024\347\030\305?`|\367\333\223%\264?\200\351\251\2644\227\237?\326 4\017\223`\351?\0355\344`O-\351?/Sk\'\374g\346?8\014\322\2528\273\345?f\013R{\242-\327?\372w\007\250\036;\333?\355w}\255\337\356\344?\370\237\003\273\206T\272?\200#_\346~\273\356??\031\036\212\344\325\356? \311\024\0245\373\333?\200\2415\233\377\311\267?\340\250\355h2\360\237?\274\333\325C\205\250\331?\034\257;\307\034\336\335?\252\224\320\302_\016\351?\310>t|\353\003\304?\225\270$\200\022\315\347?*\n\260<\310p\353?\374`+\307dy\323?\326\026\001\035e\036\340?x\342\241zD\242\323?_\233\275\301\360#\354?\343;\275((\333\344?Vh\261\360s\314\353?\340z2uO@\263?\223\014\032/\n\247\357?\254\032\270\351sg\344?\224\340\353\227!\226\341?(\265J\2354\207\311?\010p\233!\215<\322?xl.))\231\274?k\370\241\374\033\031\353?\2745\0207\352]EF\325?\000v\264\025X\\\341? \356\024\221\2320\265?\201\332^\235\244\354\351?4\032g\227QB\314?\263\037f\274\347\r\343?\006\265\355.\265>\335?D\342r2:k\336?\200\360\373\033j?\256?\034\352\006\322=p\325?\210m\215\222\024M\303?$\243\243S\001\351\347?\204XZ\227\262g\311?P\202\231pI\325\306?\033\230\347!\346\271\353?H\367\252\335\367O\357?\300]7\000Zb\357?F\327\351\000,p\330?\375\265\215Q\033\333\345?N\233\3709\355\003\332?4\347\032w]\312\341?\037\261U\0050G\354?\\\342\200I\273,\302?\232\006\271\226\010\303\356?\303^\341E\231n\356?\276\004\202\ng\003\326?F\376\027q\354[\333?}\232\276\007\310\222\351?\312!\303\003\253\334\341?I\347\277V\005\377\346?\262\253\273T\331\343\323?\256)T\213\027e\353?\346\315\265\021*\"\357?\340\223\346w\242\346\310?\315*\214\254&\033\356?6\201\255\004\021@\341?\'\214[\366\334\342\356?<\306\006\"\373\215\306?\026\240\266:(.\336?r\177\363\332%\363\357?\014\315\024R\234p\331?\200B\317Uf\270\230?\027v\342\007\2168\341?\266z\261\257m~\346?\024\337\010U\337\210\311?\000\204\023\262\313\375\334?\000\321OH\225\355\322?\\f;\256\330l\334?[\310A\330`t\344?\334;\327d9\306\304?\244\230\3411]\314\306?\234\351U>aw\325
?\356dXs\355\364\342?\360j!\'\274C\241?\247e\244\356\026\355\347?;\257\250\300Z\366\355?\270\"\322\226(\233\323?\232\354\337\200P\305\342?\ra\202\221\213\215\344?\006a\3305\"S\357?\200v\234\036\3620\254?\3541tW\257\321\321?\026\271\010\363\374\322\334?\024x\037U)\374\306?w\314\264&\335\026\356?r\311\3438:h\327?$\364\225\237.,\330?K8\244\0306\243\347?\256|z\256\360\226\333?8\217o2\353\\\276?S\214e\204;\177\357?\362\374,\241\235\322\323?\004\200\205\273(\305\340?6\215\023a\033J\336?\027\025\302\253HN\350?\034\314<\242M6\356?``c\367\326\257\317?}uy\377\201@\354?\320M\366\237\203!\313?R\210i\230nL\337?\253\224\014?s\\\345?A\210\356wo=\356?\346\216\371\375!_\355?\236\014\010\274\244#\354?\221\264\363\006^\362\351?\030\350\2121\235\005\320?l\035\365i\216i\317?\324\207\246\000\214\021\347?\316HW\363\361k\343?\035\2454\214\260\014\357?\313\255\350\306W\203\355?\234\261\340{u\n\325?a)7\240-h\350?\330\252\324\253\232\322\310?\243X\"r\224\302\340?&\377\013\317V\204\344?\273\373e\3720\026\352?Q\252\003\2677C\342?ok\234\315\277_\340?\340\326\361\303\030\243\353?\304\"\024\'\241\001\350?\3467N^\252\337\332?\300-\006%/P\330?YbM\321\000 
\345?`\201\271\274:j\327?\354\316\207\327*!\326?\230\263\237\004\002\373\314?=\207@m\374\315\347?x\033\226\377\034\312\346?\314\370\372Vu\353\357?\010L\354G\034\205\333?\222\322\'\253\343\204\350?\236mU\307\235k\333?&G\330\304H\334\342?\306\027;\234\215\263\342?6\002\271gCe\325?5\032\333h&\345\355?@nD4~L\314?\361\250\215\375\t%\347?0\377\006\332\322\276\272?@L\361\227\322\317\304?nH\004\010\227\021\350?\374\016\257\2160V\356?B\333|\026\274-\351?\\\231\302T\016\000\327?b\304\356\225\374\025\324?\037D\236\326uV\354?P\033-\272\353\024\270?(\005A>B\234\311?\3701\316h\207\272\264?f\356\344\275B\350\321?\200x\235C\255\373\336?\267\336K~\037#\342?\200\031h*C\262\202?\310Aeb\266C\266?\232\030\0348>K\326?x\022t_\306>\302?#?\214\374\034`\342?\362\376\325\312}\300\336?\006\217S\216g[\353?SR\276\257\235\311\342?~~\312\330\330P\337?p\235\331q\n\010\327?\0304\250I\314\030\330?^\t\272\233jo\347?\312\"4k\366\230\341?h\301,\276S\313\305?Ps\330\336\305\313\347?a\032o3N>\342?\033\331nt\354\002\356?\274W\205\351\264l\312?I\375\020Y%#\341?\231#\230\314\t!\345?\362\364w\000nL\333?(/\270l\352\216\312?\0000Y\372\010F6?\212U\240lWK\344?\306D\222\357\3073\344?DG\337\'\375\332\314?\343\307\021\014\275\374\350?\330Z\254\\wX\334?`CO \263o\254?\224uU\373\273\177\340?\224>\266\2160 
\307?z\2417\307\272\017\353?\247w\235\217\014\223\340?\200U\216)Y\271\340?\354\\\310_p\216\324?\"\026w\0169w\353?\023\n\217\253\240k\353?\350\322\233-\r\321\307?`\336Y$\026\261\337?\254\222\216\312O\332\347?\336\206\356\tm`\337?\212\305{s\201W\337?5H\361\243\374\370\351?H\337\252p\256S\316?>k.\"\225\334\354?\020?\226\266g@\311?\0005Uz\326\301\247?\032\225\266\370\0247\342?\000\270\230\252\237\326h?pZ\374\267O\007\277?@hT\314v\207\342?,J\231\216\372>\333?\320Fr\336\t\245\303?\206\314\024gKf\355?\000\324\002Ryi\274?\374\362\262}\243:\302?H;\rz(\212\326?\352\225(\306nP\351?\256\025\350mk\376\352?\330\246\371\221\024\322\315?\020\214\005c\020\304\345?\022\022\033EH0\350?\200\356yk\237`\272?\264c\007\025\n\252\311?P\023\325\334\330\301\347?d\307u\016\243\263\315?\244\364\t\014K\244\305?t\306\377\331\265\313\350?dM\343\277\030\020\317?o\261\251p>\324\341?\260kld\025\300\317?W\000*R-\375\354?\332\340\336\037!\320\331?xt\310\363\240\265\311?\250~\230\260\330\204\305?8\325\376\236\303\020\350?\212\347;7\216N\334?\221\000\202\031\317\302\355?h}\004\276\246\233\330?\300\014\331\0079\256\264?\277@6Yk\236\350?\331VP(\310\244\341?\276M\'\017\361s\325?WK\212A\246\\\346?\277B\204\240\"\272\357?\207\2201\341\253?\020\272\243\345\344\373\353?\364\211c\352\036\022\331?\332\333\303\332o/\320?\226\005\003\022\317\351\355?T\241q\213\217\235\342?\200E\016\016\007G\202?\356\371\247@\347%\337?\032v$\264\344\262\320?\003*\265\235\232D\342?\332\306)\003*\023\340?$\2300\214_A\330?\344\024{({\252\316?W\351\337\307W\367\340?\3226,W\\\204\352?\373\342b\371\205\271\344?\200\267\276\034I\003\224?\354\345Uy\242\254\311?\032\335+\377k%\340?\003|\224\231\243\333\356?\001\ro\352\350\366\341?\220\225&\316u\232\345?\005\005Bn\331`\342?B\274P\367F\200\340?\307!c\222\225\262\354?\003\260\235`\212\260\341?P\271\016\265\343\014\274?\343\006;^\344\215\345?\244\343d6\257J\320?\320F\311\362i\022\260?\341*\335\340\3260\355?\277\271RS\215-\353?C=\013\375sm\347?\254)\226\032\360\355\353?p\201\211+ 
\330\324?(\305\233\302{+\351?\2007\374\256\364\255\267?\350\252>\005\360\306\267?\312\363\310\203\311m\326?\237\274\200\300p\327\340?\244\235\035\252\260r\350?\250\314\023\356\265\225\327?x1M\244H:\346?\347\000\333\353\336 \353?\332I \333w\312\336? \330\351\250\334\350\322?F\333@\264\361\037\352?\270\202Dv\266\320\312?\272M\343N\233\322\340?\230n\357K\375;\273?\210J\357\\\007\214\307?\205.y\331\233\255\343?k;\217\252\351\210\346?[\317\255Y=\\\351?l\316\311U\276z\332?\326\205\365\376\244\355\352?\321\262\204\']\r\343?d(&\026$\000\320?\340\022\330\272`\327\235?\312\320\205y\317\217\326?98\322Y2\344\341?:\220\274\013\274\304\341?\246\307\036\177N\311\332?`\264\266\305\013\252\331?\242 Q\034K\217\351?us\342\273[\204\345?7H\217\244\252r\355?<\004\214 &C\336?\320YX\2629\222\263?\340(\323N\326\232\220?\010\017~\230\004[\322?\010\237\036\177\352\203\302?\364\313@\322\025-\316?\364\326$\213\353\220\316?\307\215\366\006Po\357?\002\233\360BM\263\327?\201\355o\310\364\252\357?r\200\314\367\231\271\357? L\250sf\177\322?\200\024\322\304e\336\227?\204!x\372\010\240\307? 
\253\214\217\352\202\274?S\274\207R\361\366\355?\000w\035w3\332\325?X{M\300\372\247\305?q\263\376\207\364\023\351?x\222\010\232\025\260\335?\342\264\325P3.\320?7\243\216\334\230\037\353?\306\377\314&\255\310\322?T\262\253\242\234^\352?\232\334Z$\305a\351?u\2535\276\325\263\342?X\210{]\340\312\350?\322\320\2016|\366\326?P\221\354\236\2718\311?\373\264y2\n\237\343?V\026\033\360\273\263\325?\360\320\212\002\227\217\271?\034\372B\032|_\337?\002\257{\002\004\341\353?\204\\FlK\356\300?T\013\377\264.\337\315?\371\244\0319\333\347\345?p]\220\230\315\177\356?\240\221\356?\001\341\267?\252\3754~\374\220\335?V4w\341\267\304\327?\036\276\211\002\323\311\357?e\261\344m\266k\347?\324\255\2326Q\006\334?v\307\347\262w\250\325?\260\020\023\315j\237\303?`H\270T\315\315\240?C\355\007\006\300\325\351?^\371r>\324\004\333?8\2002\r\177\037\264?\024mX\213\025\t\337?\237kl\t\371\277\344?\374\017\021ug-\303?\370%\277X\330&\343?\215\231\325\020\302\214\340?\300\232\264S\225\177\306?\230\017C\3064\030\353?\334\007\335\031 \000\354?\366\267\237-d\306\326?\250d#Z\305\301\334?\200\252\2278L\033\251?Fq\306 \232\234\347?\270\227\242\275\206\310\276??\260\204OT-\350?\322\352\222\342\303\275\337?\357\321\307A\241g\347? 
~\327\333\2234\265?P\300\177\352\262\276\330?\303n\032\211\252\317\346?\230\230\310w.\265\310?\240\0234\033\021\344\337?\327\034rl\241\345\352?c9\006k\020.\343?\347j\373\302\214 \357?\262\250\217*\353 \345?\2251\256\306\347\362\342?e\271\257\027\002\254\356?$vcCoy\352?\267\312\324\306\325l\347?X\354\364\303\\\316\326?\354\242\266\177\241\233\320?h\332\242\230\243\265\260?\014]\251\344B\340\337?\364\343\355\3036;\307?D\230z\206\204?\320?\200\347\366G\203\240p?\232%,9\203f\321?\000\232b\352\323\332\270?\310\375\217y\335@\266?0\310\231f\301\340\245?\010\332s\007\357p\350?\344\346$\373\3539\320?\373R\335\360k\226\355?Ku\274\361\rY\341?\3440+\017\212\230\352?>C\034\254\347\356\342?TV\332\241\007\241\307?\240_fg\026%\356?\276\253\254I\014\236\323?ms\007N\267\314\352?,\227T\323\320\202\333?dn\266\014\214[\336?`\313\327\3166\224\261?0\302\005tQ\251\300?\036C\350\310{T\356?\360\244}\036\260\207\246?\202\215Qa^\226\326?\0003\327GV%\261?@\025\332O\315\364\327?\2044\311^\r\353\306?Us\246yfX\351?\304\227\255\205\327\232\315?\274\353\237@\246\323\351?\016\t\261\2245\207\330?\224\226\366\341\370\357\300?\210\3039\361\004D\315?\374\021\216!\205J\302?\300G}\227No\242?\346\017\276\260{\370\351?\206\203\241\310J\321\341?h\356P\241\177\330\260?\220(\316\r<\336\330?]L!\231%^\354?\351\326\3709\337\313\357?0\324\2162\351\347\242?\013*v\025\034\363\346?\016\023\345\022N\000\352??`\377\216\363\342\352?)\003\356\276\241@\343?\010\222KF\374@\321?\230x*Ju\202\304?\266\221;.\243n\354?\2334Oy;j\340?\371\326j`\325\230\344?z\002.\305M\310\341?\240\035.]\036X\236?\305k\262\2404\004\353?\210`\227\237?Z\302?)\313\345(o\020\340?\303\'n;\032T\354?\362\003\317\016\345-\350?\0148GPA\352\357?\225\237Y\023\000\371\353?ry{\322\251#\321?\204\003\270\360\304\035\354?X\347\330.\274\205\303?X{\274\360\315\312\355?\267\273/\240\346\355\342?\204\201\357+>e\333?\274\014\336\367\353\342\313?2:r\224.\255\325?b\005\341\002Kn\321?\026I\225g\303\002\326?\347\342\330\336$\226\345?\246Sd\343#\260\353?@h6i\225\026\317?\35
6\306\344V\320\201\335?\220M\014\270\333\361\246?D\260\020}\2121\343?\254\252\027o\321\030\317?\360\312%*\272H\330?,\177\341\022\376\213\311?32\313\356(\222\347?Z\364\365_a_\320?\270\025\300\323\216\344\315?2\315\033\363\354\376\345?\207\345\020\006\342(\352?\265]\211\344\267\262\347?@\353\030\227@\'\235?\022\246l\353\370^\350?\340\362Q\312y\360\232?\r\340GA\306,\345?\222\317\316\007ru\320?(\2525\260(j\260?\310\343\357\037\003\006\306?4\020\013\366\277\327\312?z\230]uVU\351?%\001\217{L\344\357?h\251a\245\247b\346?&\225\310\014f\035\332?fE!\321\344Z\343?\\`\006si)\351?J\225\213\314g^\346?\021\221\306)\366I\351?nM!{/J\341?\2205\374ur\245\241?\"\317S\301`\267\320?\007\033z\263\353\013\353?~\014\216\316\3609\342?vLT\212\226\303\340?\212m\344\303?\350\324?\210\272\010\311\205u\314?]\303h\314\216l\357?\3477r\277\032\251\347?\207\333\305\312\304\022\357?\204\016c\202\200\356\335?tn\351\361\324\017\341?\351\362\'\335W%\347?H]\377\273\235=\273?\300\033|\021\252\360\215?\360\337u\273~B\356?\220\220w\257\217p\247?\2144\016\254\201\034\323?\213\311\202\357\232P\354?\240\201_\371\223P\237?\034\353\356\322\330\271\342?\200T\364L\212\322\312?\367\n\t\033>\360\356?i\364$\307\276w\351?#\035C\342\260\277\352?4\241[u\253d\331?\370VzL\3018\265?\220W\021\206g\034\276?\345\004\245\033\305\014\356?b4\376\320Zp\352?$\227\237\352\327\332\317?\227\226\216|\351\275\356?\357Y#\374pQ\351?U5\310\024\357+\342?\350E\252\265\233\365\276?\300\316\365}m\275\321?t\020p.\355\323\342?pb\"n5\277\327?\340\202F\026p\r\347?j6\314t\300\370\331?f\225\002\235N\375\321?x\354s\0010x\313?\230\203\310\220>\330\317?\377UU(\3613\350?>\362\223\037\343\222\352?bE\235\271\366c\322?^\253@L-v\351?\000!7\240K\312e?\372_k&rA\342?\\\031-\213\266\373\335?\370\341z\257\251\251\337?\270\031\023?\3440\327?\236\0331\262\207@\357?\210\322\237\300o\357\277?\235\243\343C\266\002\356?\222\217\020]\224@\356?\000\374\346*\t\355\342? 
]\004\251W\311\255?\316N:D4\333\324?\002Iw\037\0106\355?\226\243\025=\257\303\321?\013\233\017wd\241\343?c\027\312\346\022\321\357?W@l\014\363\312\353?:~5\220\270\363\336?x6o\264H-\346?x\221\314\357\'\276\277?\204};\316\035M\307?\374b\300<\255\337\323?\332%nl\203\205\353?\231~\370\027{Y\354?\214F\211\264\000u\316?l\023\252\005\367\314\300?\347-\247\022\337\304\343?\251>\t\212\026\215\340?\270\014\321a!\355\306?\235\271\310:\376\233\341?\350\363\273MS\223\312?\366M\010Y)\246\332?;Q\331\353\367\347\351?=Q!%\377\234\346?\007\300>\341Q\254\345?\220\000\266\310E\325\313?U\2230\001;\354\346?\274jm\256&\264\353?Bk\217q/\030\337?\027R\225Z\"q\352?\276\357\030O\326\244\347?u\"/\262;\300\346?A\213\254}\177X\340?\240\373\031w\233\322\300?5\344\231\216\220&\352?\r\032\266f\376\223\345?\020\317\007E_\224\335?\360\307\230\247\230\232\337? An\330e\353\263?l\322:\360a\337\317?Z\212T\205!\333\353?-\242\341|^N\353?(\213>\017\225#\311?\000\225i-\255\277\312?\264\353P\377\031>\334?\377\327&\226\216\217\352?\257\260N\260\035a\344?P\223\001T\027\366\342?\252\311(\370\261\216\334?\356\314\355\3761y\346?\315\331\323(k\217\344?\350\022\032\266\304\245\324?\255(\323\024\265\244\351?\006s\346\235\377\210\345?\316\244Z\307\221\252\351?\372\243\344\364\306*\332?(+|\225B*\315?3\314\350CPW\352?\020)\222F\246q\266?\350$\3450e\250\264?\375\320\342%\204\350\352?\360~n\326\000X\311?`jA(d[\346?\200|\025\211s3\224?\177\\\226\363(\344\346?\374\210G\373\362\212\355?|\345\235\362e`\347?\324\257\277EK7\305?@\236\341\252\261N\247?\224\366\375\0034s\333?\221\241\2529\000\277\352?\274\022\230\356\335\271\325?\270\232\350\207x\324\327?l\325\373\247\256\316\332?_l&\227\3625\343?\262 
\365S\0015\334?\256\366\267\334\233\366\352?V\356Z\273\367\341\343?\340?\340\253\333\363\033W\265?\003\033\253\224\201\324\346?\360mR\216\214\257\337?F\177S\r\212\277\340?\027i\343k\007\347\356?\341F\200\252\201W\351?:\035\027\353\3132\351?\211\217\314\r\362+\347?B\r\212\202\356@\331?\232\216db\241\031\334?\313\236\307\230\337\353\345?\234\t\370j\0142\331? \305@\306S\346\247?\256\201\"J\337\007\355?8[80\222\270\301?&\344o\213-+\353?\375\\;\037{\324\353?c\303\2312\250p\355?\254\243a\213\223\227\334?9\201\266J\256G\345?\3624[\371\307\256\340?\222\322\326\201\273[\355?\3506\222\337\003Q\352?\021pd\310\266\025\343?\006\014]\3053/\341?\037\267\255\301p8\342?\364\265@\264U\035\312?\335i\361\'\031\377\354?\r\010|\363\'\350\346?\210p\351?\010Q\343?\000\267\302\324\300\037\246?\372\265m\257N\277\342?\370!}\234U\236\324?\344?\215X\352,\356?\247c\204\312x\206\351?@\211\\_\001\344\205?\263\010\331\220Gq\341?\'\334\234\230\211\347\356?h/7\006w\250\336?)\000\2507i\310\347?\216\013P\324k\326\351?-j\340\275\232T\345?\000\231:\307\373\262\342?\250z#B\272f\303?`A\034\265\256\024\343?\000\374k4O\327\322?`\2561\251\236\337\336?@\342=\310\325\241\333?\302\010\024W[\250\325?k\237T\026\237@\355?\276UN\300(\n\324?\235\203Y\374^\326\354?\036%B\257N\211\322?@\263*@\377e\231?H;\211\336&\232\347?\340\014\372\301X\327\307?X\207S-\371S\344?^\300\305\034\337\005\330?\033\266\3727E1\340? 
\352?\356\"\253\256?\302*\210J?\310\331?p3a\325\234\306\265?!\271\314\216\206\236\350?8H\250\377[2\326?\376\332\253\366\036\261\344?\"\324\005\010\245\323\356?\237rq\323\232,\347?\002\374\237x\337/\326?\013d\343#b2\344?41sz74\304?\314\'\255j\310\243\317?\250\310\303W\335\272\352?\340\262\0309hO\256?\365({\361\372-\352?f\331a\2079~\330?\243\207Tu\216)\350?\216\202#\323\337\202\323?\210m\032p\007\323\267?\344&\352\224\201\301\334?5\016\212\266D\203\343?b\276\3223\350j\343?}\037\273;D\215\350?xn[\251\366w\266?\344\377J\313\272\211\343?h\251b\013O\236\321?\320\033\220\031\230\373\242?\320\263N\363=\351\271?\246\250\000\177\267\211\344?\313Io\300\007\337\351?\303z\272\252jQ\357? \342?\334\266\202\313?\t.\367\3633\205\343?\360A\374\353\213\\\344?\004#\331t\225\236\356?l\316\210\246GN\302?*\001\210\214p\024\332?N9a\254\2726\336?\0263\336\272\344$\332?\020\220\'\237%}\320?\3708D\327/\031\332?\273\300u6\0056\355?\224\277\361\203\347\227\327?\377\031\214\t\320d\353?t\247\360\346\215\335\307?\224-\020u2\255\350?D\020\005?\010)\321?\370\335\335\261\334\214\303?\'\037\362\330\260\010\353?6\302\365\203z@\335?\320\276\276\037\002,\346?v\034=\3137A\333?\314F\325\375\225\200\323?Hk@\230\300\242\343?\325a\202\323\221/\354?\267\\q\377\372\371\342?_[\202\201H`\343?\361%\274\356\225\240\342?\341\235b\277\372\354\357?\"q\311\310+\252\320?mI\212!\344?Q\212\350\231J`\356?\300\'r\343@\n\325?4<\276\223\020\222\324?\214k\3501\007\370\341?\264\270\0305\027\335\351?\372X\245N^\\\351?l\350\354Tz\t\337?&\215\270\270s\327\325?@\352c\334\241\350\317?<\261\303\263\"O\311?\325\300\204\225\3533\344?\013`\301Z\"\324\351?\2334|\363\031\010\352?\245\260\363\027\301\276\351?\241\216\247\243\303\004\340?\240\323\t\3664a\307?\214\206*\210\353%\300?@\360Z\034.\354\230?PCs\326\r\245\253?%8\327\003?\002\354?\361\003\336k\313\n\346?(,}\037Y}\337?\357\263\226\2200\344\355?\240\277o 
\242\017\254?\230\366U/\004T\341?R\035\022\342$\352\347?\336inm\265\007\345?\260V\312\2774\377\340?\215J\035\'n2\356?\257x[be\205\344?n\220\373\235)x\346?oU\301\264CD\350?\250\231?\t\315\267\335?\224>\tQ6\361\323?\232#$\222YE\347?P\352\222t\305\366\323?XQ\003\022f\220\262?\354=\002\002\\*\345?\230\017!:\202\342\314?F\313D\346\242\261\334?\331\243\250\014X\226\345?`M^y\360\320\247?\322\035.4\235\266\331?\030\277\351\373\'\242\270?\210W\215e\350\314\312?)_\353\336\243h\342?\210Z&8\371)\273?\310P\030\314\253[\302?\334\375Z\210E\252\300?\360\234\374\324\344?\250?\316\253o\002|N\321?\210-\225\261\331\335\340?\216V\"N\014G\343?\204\226\316\277\304\231\313?X\250l`W\333\300?=\355N#v\242\347?\360\306\334}}|\260?@D\334UM\251\313?\024\3219\261\353\207\345?\216\363\324\274\210{\353?L@&\231?#\332?\3056/$\010q\351?\244\237t\333\r>\351?\300\250&\325\376]\277?>z=i\2649\350?\013\265\257\377\205\225\345?\217V\232\204W-\350?\376/\247rrh\346?\240\347vC\227\013\300?\tRq\022\007y\344?Zh\274\252\267X\333?H\223\253\244\014\256\314?\000p\033\262l\325\354?\245\2205\253v\367\352?\264\352\314M\330\300\305?\257r\346\026\337\032\355?\306\216\203>\2712\350?D\242\032A\017k\303?@\245\355\234\220W\321?\2566\016\003\343m\330?\346\206o\321\027\343\356?\350~\025\367\254\215\336?\331?Y\375\375\372\355?\320L7\037\003\355\323?\002=\024G\316\025\354? 
\211\030\'\242\350\253?\237\365\230\246\300\205\341?\322\037\261|\272z\323?P\252\0041\357m\255?8\344\330\022\354\255\342?\360\377bf\0050\334?\344V\230jB\356\320?\321C\310*/\306\354?\202\340\236\312:P\325?\014\027\237?\336\302as@\205\357?CgKV@w\346?\222\356\331Y\301\006\346?\031\265B\345\206\340\340?\000&$\243\336s{?@\361\2010\270\304\300?\021\366e\031I\275\342?\255\250\264k_\177\344?\001\251Us,\307\352?P\rB77K\326?\240\301\204on\242\240?`e\272l\222\202\314?\217\351:\332\t\327\355?N/f\237\311\312\330?\0342\322\366o\353\327?\0228\362\247\3425\346?<\321\362\3661\233\311?D\205\375\\N\026\344?X\366|\352[i\314?\200\023\t\373\244\204\204?[\275\330\366\317N\344?\017\302\036C7J\355?\200\263\245j\211\376\255?V\010\326\342\320*\356?}\250Y\311%,\343?\273\366\303\n\001\302\355?P\023titn\303?\240T\245\364E\206\242?|\244\005qy\212\322?\032\007l,g\301\352?k\271~)\232k\350?\020\300v.\262?\266?\000s$F\371\246\340?\243\315\t)D\377\357?7\272\352\231@\272\344?\010\370\315\013c\367\311?T\253\"\250\033J\323?\340\307e>Y\321\231?\313\241\231\352\331\235\352?p#\361yR\236\324?4\325\032\r\036P\353?@\002\0163\246\352\326?\300s\264\177\264\343\322?\003y\362bu\321\347?H\014\212\352!\347\272?\034\352\246\004\206\343\331?\241\200\233\351\023\356\345?\323\203\304\020\255\330\346?\025\251\367\014\031\377\351?H\227G\027\242\240\274?H\315\244\2504\352\301?W\227\035\006\371\257\353?>\270n\246\":\324?x\335\367%\234-\271?\340\325\364\346\007\025\254?Z\236\226\002\270\250\356?\230+\212\243S\271\270?\256\275\325!vT\351?\"K\303M\004\251\324?\332=\'\000\031\217\327?\321\t\264\344\216+\354?E(\316\002gx\356?A\264\351\346\200\247\351?0\025\347\322\020\246\315?\216_$$y\315\345?\302\252\n\321N\t\351?\307\227X\234\024e\347?s{9\002\300\243\346?\254\267f\237(\032\306?0{\022\272\374\217\255?\370\232\212\324\217\t\264?\250 
\275\253\251\224\347?n\"~l\033\343\354?\224nmi0c\335?<\244M\321\304H\327?\274\371\037\371\375\343\352?\230\035\2167\255\341\337?O\200NZ\2478\353?\220\346\325\202\341,\265?\360A\027-\243\344\270?\236\275\333N\311\343\324?\260\363}8~&\273?0C\243\366\3017\345?\216Y\351\026_\276\334?\302\0316\251\260\303\320?\002\277\263\330JW\340?\207=\221V\376\016\347?E\217\"\242\0313\347?\3274\320\357jL\353?\270\250\013z7E\351?\330\212\000\374\302G\260?\370*\363\271vm\335?\013\323\332*\2300\346?$\304\213!jg\335?#\017\005#\313\346\352?\004\254\343\006\357\023\313?\250\254\016\362\362\003\356?X\217\240H{\263\273?0\033\345\322\365\201\302?\302\021\367\276\250\343\336?\300\326\025\005\311D\343?U\306v\340\343\230\350?\244\365]\002\202B\316?(C\366p\363H\274?N\003a\300Z\211\320?P\313(\341^\262\243?\214\330\314\244&$\300?\302\333\207\372}\200\357?\353\251X\227\251\213\343?3\225#\004\034\245\340?\230\226\331=\023\331\303?\'=\263sT\331\341?\275^\325\243!]\357?\354\317\211\306\345\225\312?\347L\250\021\221\021\344?MStw*#\346?D\223\234\006\025T\313?\025\266\2112V\311\346?\250\205@\326\313w\261?v\232\tY\340\260\320??\374\245\306>\271\341?\315\216\306\233\304\215\351?Fs\357i\257\366\347?\000xX,3\006F?\375\253\203]?^\350?\200Ck\346^)\257?S\250;\306\367\307\353?)\324\031\031\021@\352?-*\314&\271\007\356?AOW\025\014\227\356?P\0235\022Ed\271?\014\226\363\374A\273\325?\2016CCy\014\357?B\324^V\276\025\354?@w\230\222\325\226\210?\374\030\001\0024\027\354?\324\2579\206\321\263\337?n\373i\017P\275\322?\350J\220\010s<\277?\017\377\357\313t\262\351?\220\372\233\027\223s\355?3\253\341et\035\340?+\307\344\256\364\371\341?\034\276\3103`\262\351?\301\222\016!Zn\354?\304\rM\331\257\257\340?\364\316\232\277\312q\301?\r\336\265\320Lk\344?\254\346\334o\374-\300?d\265\333\344\307\243\351?x\016\306}\030\273\354?\244\0269RA\'\302?K\371bl\317\203\355?0\013\023[\305\306\346?\252\332\232\250M=\340?\374}\307\247\242\224\343?\250\230\254\217\237\026\320?\002\274\200u\256\375\331?dF\\\331\246\203\344?@>\224)\274\020\202?\330
YZ\025\201t\341?\306\315\227\335\207f\340?8\233\"\250\037{\353?\350@?\364\330\344\326?\240\261\307\365.Q\226?j\237Y\t!}\356?^\350\261`\211z\351?\226x(\024\317\374\337?\222\311\372\273I\016\347?\226)\303\013\n9\346?\334\260Z}\305\037\307?\324\206}\277\232B\341?\243_%onF\352?\2106Q\036z\251\332?\351lk;\033\232\341?\310FA\023\201L\313?f_\347\311\007\204\323?\245\3456\205\303\340\347?n\0228\317\213\006\331?D\343\361\253\206\343\356?\334\010\277\026^\r\316?N\277\364\353\003\302\327?\320\241\262\223\374;\353?\233\315\341\354\321\007\341?\016\301\237\235h\320\350?\"\2660\033Q\321\334?\356s\207\013[\023\327?\327\221\376\370\351\376\347?`\373\342wZ\333\226?P\003\267\264O\310\320?\266e;r&\005\346?\230\261\034\325\231s\331?\212\313H\r\223R\335?\231\261h\352\375\017\345?n\355\2115\340<\347?\354\032&%\031\224\320?\323h\266|\005\227\346?H}\027\002D\240\342?\240\004+\227\315\315\315?@c!\336\0355\356?\031C\315w}+\352?*\201\3707\'\237\346?\350\243\202\216x\217\260?\246\027\n\325\275\201\334?1K\264a\273(\344?\320\370)\330\242\361\305?0\'a\330sG\322?H\255\370\345V\251\325?\266\215\031\346\222m\327?\206\t/\255iV\334?\241GJS\371\276\355?\221\325\320\036\334\025\350?@`\006\300z \244?\214\343Bl\007\350\321?\\m\032\2475&\356??:\253\244.b\340?w\274\305\275\217\212\357?u\373\326D\021\227\347?\350\272\365\010\306Z\306?\010\361\001\276\200\356\345?F\206%\321S\013\325?\230\363l\203d\t\357?\237+\261\306\266o\346?\306\255\242\377mJ\332?\256A94\275r\357?tN\371\234B\262\354?<\221\207\315\243\332\354?\324\323\2317\2325\357?\214\3044\021+ 
\323?\310\177\217\2010\244\322?\324x\232\256.\237\351?WF\351\007m\340\341?J,\302\264\301D\343?`\033\227\022\264\347\350?>B\035\214\261d\347?\340\203C?\252t\263?\304}#\222\223\240\346?n\030\302\371\364\326\327?#+F\235K\376\340?\230\340)\001w\020\306?3^w\001\3472\345?\200\013F\235T\243\345?\230\315\327~|\225\331?\323w\320\276/h\345?8\215\313\035\304i\275?\220\325\244\027@\243\260?\215\021\325\177\200\322\344?B<\212\214\004\246\331?\350\026\\\306\017}\333?\024\274^\033\252,\335?,\203<\334\364Q\312?\210\2431\007\005\205\307?R\2678\020\270\201\337?\260\354m9\274\303\324?\327\302\304\264\007\036\342?Zh\371+\0313\352?U\210\205\372hE\352?\024A\316\367\211\242\346?HG6VU\010\303?];\022\344_\005\355?$\336\301\364L\214\334?\316\226\035n~\035\356?I\235+\367H{\355?h\266\260\212\277E\326?\000\026\222\206\313\362\217?\300\033\330\177\037\216\272?\360`\352Q;\214\326?\346\304\353zo\344\357?k\2014\025\020h\351?\320\346ULO\037\331?G\021Z\363w\200\344?\214v\207lg\221\323?0\327\346z\322k\267?l\243\205.\220\000\337?\264\021%\312\377\301\321?\3240\242\030\315\342\307?\374\251\311\341]+\321?EC5\224\342\255\345?\317MQ\032`7\344?\376\3742\274\002?\355?\024\034x\314\351\252\312?\231vQ\306\033\326\357?h\365\334\342.\357\275?\366\321i;\231\250\345?`\335\036\030l\003\242?\000\002eI\352MP?\222Pc\223\242\204\341?0\370`\320p!\272?\004\366\333D4\"\304?\000Z\247\n\004\200\306?\270A\363N\223Y\267?\256\312\330\325\223Q\325?\320\371\262?)\030\272?r\343\372\236s\002\341?rf%\261\\7\347?\340\\\245\004\034\211\352?D\334c\3744s\343?\311\216\223\202\315 \346?\252DA\226\220\231\335?i\2323S\204\373\343?0\265\251\230tc\313?\230\323\260I`D\323? \241\332\342?@dh\217\025n\337?\254\370\370\034\270\211\301?\234\260\253\343\366\373\337?\245\013>\352}w\357?\356]\204\313~\214\334?j\306o2D]\343?\341\205 *G3\347?f\3269:\243<\320? 
S\256A\236\267\223?\300\240\207\320\002\330\247?\231)a\n\351\240\341?\370\320\344?\271\031\305?v\261\034?\254b\326?th\332}b\303\350?\311\027iD\367\366\356?\377\n-\212\240\244\351?\004\271+\355q\325\303?bKh\227\246m\351?<\033\234\245\236\354\342?|X$\275\241\271\313?\370\375\376F\330q\267?\313\245!\262\357k\352?\253\364\325\036\216\272\353?\213e\307h\373\n\347?\035\261Q\026\376!\346?\004\361\210U#\300\344?\246\032\216\244\275;\344?\2221\031\340G\337\323?\210z\014(\362\013\325?\001\225\257Ek+\341?+\341\275\374~7\357?\202\273=E\233\032\353?\354\213?IgA\325?>w\337\232\347\367\322?\326-\3238<\335\323?l^>\357\226\236\305?\230\003\363\000\354\264\301?J\301\330w\227-\325?\216>R\352\336P\325?\210\245\330\325K\'\264?\350\3141\345O\373\347?`\2647|\375\311\220?\230\267S\241\267H\272?Y\"\"\020\016\363\346?\305?\337X5\353\207\343\356?\200\277\320<\177;\222?\254\260/d\033\323\336?\330\237\243\242\371J\321?\366;\270\270\t\217\331?\253\326\2361Y\245\353?\340VbJO\204\301?\305S\234#\366\"\350?\376\032\277\007p\262\331?\324\224Q\332\340\322\307?\032\027\356.\215\013\331?\253\331\361e\253.\344?\324\017h\367\321\227\307?\252c<\303\370n\357?\360\263X%\312-\252?\014\267\\\241\235H\357?Y\277\n\r\0140\344?\260u%\304\010T\314?\377o\303\017\331\372\345?\\\317\031\242nP\353?m\220\200\367\346X\350?\271\366\207\372ht\345?8\261\227\335\312\241\337?o\223}\351\035r\340?6w\201\2572\351\323?\344u\364\306\2449\317?N\033);\311\204\357?s\024J\311\271\330\354? 
\315\336\307Z\000\271?t\207!\202\036^\347?2\221\027\261\240\002\352?W\024yOT\230\354?\270\030\334\264$\366\350?#!\267\363\037s\346?\2641\311%n\001\323?}\374=b\r\\\356?68,A\223\302\323?\312\301\025\224\027?\337?i\311%\251\243&\342?\235\301\202\364\032\235\351?\240\242\343\246\250!\220?\330\246\371\037\235$\276?H.\235\n\016\264\340?KC\220\210\301\217\341?\3241+\340\366?\324?48\254\341\227\036\323?RA\025\222\347\217\324?\353\304\371\345\236Z\344?Z\206J*\367\372\350?\234\250Q\n\271\231\332?4_\226\013\356\310\355?\302^\362\317f\312\331?\200\2031\206z\t\265?9\302\332Z\346w\356?\322\031\030\346\244 \341?\340*\200z\207\357\335?F\\\3578\201\315\342?H[\224\243\270\350\276?\210)b#{\304\335?,Lq!\256\251\310?\376\242\312\033\240\020\347?\236\317\245gA\205\336?\0045\272\206\373\216\332?\030\311S\345\340W\320?p\311\034\340D\024\301?\226\251\036\25555\330?\030\226\271\351+\006\275?\003\252\205\215+\322\340?\204\242\362g\214+\345?\377\203\272\016\217Y\344?\346\213=]\317\304\342?\\\313@\026A\365\315?\324n\266\272\322`\335?5\334\244d=6\351?\215J6\205QQ\343?(\267\231\017\255\'\313?P\227\262J\236\316\352?\220\322\334_0\373\264?6\242\341\346\030\375\350?\277\017`\251\360\037\357?\\\350\255\001\274.\353?nY\220\002\3145\333?d\'\255S\0279\327?\335c\267\326!2\340?\274\036D+\202\207\300?\246\210m\227 \247\345?\240/\304\305\224^\240?\254\271\031k\004\034\306?I^\204\307\327\224\356?-\241\222QM\214\340?\274R\220\272\324.\352?\000\262l\t\254%\317?\260\236y\253\177~\252?|\302h2i\337\336?\210\202 
\261\037_\277?\035\233\352\016\225Z\345?\260,R\341\001>\263?P\'c\027\270\361\334?\27211\215\277\374\341?%\276\037U\334\035\356?@\321\200#\343\327\324?\352\271X\363\241\275\346?\224\037\314\304\345\010\322?\255\376Ck\222\376\341?\350\260a\033\352n\340?\305\363;E\362\332\340?,\365^!\337\347\356?\002z\332\010\020Z\323?\310*\n\375\342>\324?\254\370f\213>\333\301?\024\344-\'\357\222\357?\231\016g2/F\346?a\310\260]l`\350?\371\2125\214\344\227\347?\000u\273\224\217N\311?\364\223\230wm=\316?8[E\303RY\276?\312\023\332\303\031\305\334?\357tu[\261\023\346?@\025\216\\\rF\221?|\023\037\001\237\365\302?nn\236+\252\306\334?\026A \376\315\337\347?\277\020\223?5\'\355?\310t\316\354\336\237\301?\366\230\345\004\324\336\344?T\\;i\304\217\310?\000\021\321\014\347\345\350?L0\27407\276\346?\341\363DZ\322\256\342?\301R\235\241\023o\347?\344W9\224\001\023\314?\2213Z8E}\340?<\274\235t\017\247\300?d\264\200&i\354\323?g\334,\264\376\240\341?\200\324\247\3067\205\260?HD\332D\245\215\313?\316L\310\025%\177\331?\022\362\310\277\216\361\354?\270G\317\250\307H\357?0\002!\241\211\314\333?`\304\320_T:\303?`2(Y\3659\223?`\320C\260k\177\252?0\205\017\017\247\261\331?\376,\214I\0314\323?\2422\221\365\362\206\354?\362\030\351\177+\202\330?\230J,\301\\\317\354?\346aZF\t\177\326?p}\177\021\320\234\242?\332\323\211\226\210\r\353?\020t\312\0361\341\355?\272\211\211\340\327e\320?\336l-h\2448\350?\303\232\215\245\335\232\344?a\347\267\214\000\320\354?\214\352\004h\003\343\340?\346\n\263\034\300\353\342?\306\340\3129NU\325?\024\317\204\235Z\332\343?\352\005\227G\264d\327?\331@\231I\357)\356?za\352\265\206\013\344?t\251\245\303~\367\352?`l\224ja\340\234?\200aqv\335\367\272?\250\330\214K\005\007\311?)vy\200\317\007\356?\210\037=\344\000\264\350?\275\263\274mP\300\347?\233c\274(\356\314\342?\270u\354O8j\276?\020\352\316-`\024\262?R\273\003^Zh\326?\200&\221\321\326\032\213?$\301\211\033\rk\357?\037\307\020\315\376\210\353?\252Z\177\270\344\204\335?r)-\034a\314\336?\301\235.\306\016\013\342?\032\211g\254\021\005\34
0?\005\273\212O\303\024\352?8Gu\202\216\211\322?\233/\327L.\230\343?\300\314F\303w4\247?\377h:\323\206\023\354?\032\343\034\251\266\212\321?\275\270\203\032[\213\355?@\003\231\246&\377\220?\256\224\334\017\035\263\344?\274^\210\373Ws\326?>\347\302\321L\266\325?X\214\215n\323\007\342?\345\300\263[\332\260\341?\200\\u\225\215S\265?nC\351\231\216\016\333?\014n\2753\361\337\300?\345\304\r\314\323\203\346?\350\025\256\202]d\306?@\255}\221\230\013\326?8\2667<3P\265?\020\355h\"n(\307?\366\242\373\245\221\225\352?\344\365k>S\202\347?\340Z\005x:i\272?\250o\254\372\243\226\275?)9$$\356\337\352?\350\310\300v\347\316\331?B\353\352\2060\256\343?j\354\024\245\004-\336?\030\205\321\226\256\216\322? \277\3142\214\337\241?8\273\275\033B\202\263?\200\214$\212\276\236p?\030\323\027\261\301)\303?\367\031\262\275,\300\345?|-\t\347\310\223\301?r>\366\224R<\353?\274\026\330\344\352}\354?\034\246(\246.\225\354?p\005\273\006\342#\316?S`\207\243S\321\357?\n\377L\322w\306\340?\321\030{\003\265!\355?\003\024\014\270\322\003\342?\207{\310\270t*\340?\244\274\261\276C\220\313?\024\026F\027\220\263\344?|d\036\223\310\255\313?\220-\010\317\351\020\346?\316\213\330\320\010t\330?(|@\270\330a\265?Kl$\304\306\342\350?\322\271\325[\251Y\340?\344?\364&\204,\302?\004J\361\3568\253\315?\325e\374\373\003l\352?\250\022\256\265\020\201\317?oz\262I\360\030\343?Pk\245\334c\231\246?\020\373\252l\3526\310?\207\274\305s\243\005\345?\202Y\336\022-3\327?H_~R\250\020\321?B\231r\234\232\016\351?+\r\026;\363\217\353?\260\336\201e\350\363\241?\254\263_\251`\322\350?`i\r\242\"m\305?Sk\312)}o\345?\360+\263\026c\016\311?q\273S\201U\t\352?\246\374\214\356Y\357\332?\366(\375\317\356\234\334?\344\334\034\032^\357\317?\024M\311\324\t\331\336?2\232\315\024>_\344?%\035,+V\210\341? 
\3434W\324\266\343?Py\373\'\177\337\356?\220t9\301g\345\354?R-\035\277\343\327\327?\034\234\223\tD\264\343?\247\321B\017\362\200\356?\240Vv\0165\211\324?r\250RQ\334\'\335?j}\025\254\327\034\330?v\303\032k\307Q\332?\340\210\033\347\035\036\314?\340\3537\356\246$\225?d*\205\204\033\007\316?\261XJ\212\221a\357?P\007\342\200\006\332\314?\305UHe\367Y\347?F0\240VW\264\350?\260\251\303\251Xz\255?\364\002\246mA}\300?\361&\313 U\265\354?\220\265\344\324\362\337\324?\250\335\333j\246\272\336?V\330=2\014W\337?@\013.\3603\360\336?\235\375\360\037\374\372\344?\204\200;\237\250\206\346?\n\255p\370G\222\352?pM\347i\2457\347?\\\246\232\313\214\211\344?\020\006\375!|3\332?\370\301\355_\344y\274?\361f\244\350i\351\347?\3107M\274s+\321?tq\006\202x1\306?\3670\334\215RS\341?|\251\225\035\321\277\311?\202\037\306\360 \023\356?\207\364\032\251\271?\344?\364\373e\336f\037\306?\200\nj\177\010\024\254?\300~\232\023p\t\310?\327\352\016nOG\350?\353\271\267\030\330\303\355?\374\300\377\005E\017\344?\030_\250\265\211\201\332?\333Q\242L\211\357\355?\360et\023\277{\255?\006.\256\306k^\335?\274fM\250Lr\343?\264d\325\2335\016\314?\217:\266\302EV\342?\345c41<,\356?\201T\014\363\0341\343?\360F9\023\266\343\245?\200.\262\3558w\306?\006Q\001\2453v\356?[1\367\023\215w\352?\340\207\341:\324\316\223?\220J\014\314\216\214\247?\334\335\352\324\376\276\324?G\211AVij\356?YW\232\311\005\256\344?P\025W\205\233\361\343?(\324\306U\334\243\325?\364\230@\031\035\313\336?\354\016\310\352\351\036\336?\200\210\323\236\325\247\246?\230%Y\032\336\216\352?\016\252\244\010dd\356?\243\306%\013\017\250\355? 
L&\033\241\321\262?wn!E,\236\341?\274\312)\336\266\320\354?\3373)!t\331\340?P*)\037\303!\241?\007\364`^?\265\354?\\\347g\320\245\005\353?\332\315\336aA\327\327?\220\223K\364\307\317\354?\026\333\t-\252\265\337?\211\347%T\305\373\350?\257-\205{\320\332\344?\303QR\327\324\222\346?\020by\313\341q\265?L\335D\274\2060\353?\360\232\021\321\017\234\255?\245\302d\237\016\216\353?\343\377O\261\357\242\351?a\303\265u$\375\342?qC\017\276\375\361\346?\342\031\023\212v\010\322?d\251\275:\241\021\346?\222&o\321^\346\353?\373?\251;9\240\352?/\303\356\017\331O\354?\343\021\272,\216j\346?\004\"\336Mb[\305?I*^\377&\032\345?\316\376\224\026\245\264\345?\204\341hC\300\252\321?\356P%\031\324;\343?\362\213\372\314\2202\350?^;\264cN>\351?\330\222\365M\245\240\331?Z\322\257x5*\346?\035|f]\336\231\353?l\032\261^\037\261\300?w>!5\261\204\352?\236S\335\177%O\322?!\223:\322\037\363\356?s\006\230\272*5\341?\031\036YD\363\263\354?\354\365\025\231\307\217\307?\342\342f\273N\033\340?:\n\346\317\364\211\326?\030q\225\241-\201\311?n\200\003\315:\000\341?\205w\311\342\205\311\354?\033\225Uz\\\332\352?\360\026\322\005\356\t\277?\"\335\n$$\372\334?H\261>\301\277\236\302?\223\336\002\300\305,\353?#8r\246\005\340\350?\216\356z\255\376\205\350?\356\264\240$O\334\332?\267-Cy$\326\353?B\'P\3076\026\344?\270\227\035\237\023\327\277?\331D6\230\253\330\342?\264\211F\354\377W\301?\000\245\t\325\333h\342?\342d\226lho\336?\360\177a\367)\257\317?n@\2743\020\013\350?&\031W\333\277\224\351?\302y\3410\264\244\321?\370r\312\261\312<\325?\340\220\220=\307\243\306?\315\237\3030cM\346?Y\3155\'P\342\354?\222l\277\220\332&\325?\000\004\302\2614\032\256?fa\000T\241\316\342?\270\252b\354w=\274?\320:\274<\360\237\340?6Y.\263=e\340?\374X\352\n\201\255\327?\013\020\253\026\274C\351?\002\255\031\036\3212\334?\214]\313\275\367\233\345?\335\367\236\341\345a\343?\027\031\261\035UR\352?\346\222)\336\016\246\344?@\322^w\'\307\203?\"t\315r9z\332?S\t\300AJ\332\342?\365\025p\027\027\200\346?\\/lx\262\'\355?\004[P\334\272v\335?\003O\020M
L\307\347?\030\030;\235\355\311\357?\372\315\252\006W7\324?u\241|\344\010y\353?\036\026\371S\362\326\334?\nP\265\251G\260\322?\364\361hy\325\233\305? \270\350\270\220u\303?\370<\253E4_\267?\354\361\205\035{[\347?J\255\177M9 \342?d\035]\021\006H\335?\310\007q)\220\233\306?W\256el\332j\357?RL-\3618\267\321?\230\354\311H\005\003\330?2\315\235.2\010\341?\274\024\332\344\236\r\317?\204\354\236\236.u\323?\320\270g\024V\366\273?rx2\025\307V\356?\224PN\034\370\237\335?\366\020WM\213\201\351?\364)\321\274\272\006\326?\226\235o\224\366<\351?H\335\227\013\005s\276?\006A#+\005\315\340?\200\230GI\374\207\261?@\200zbY\022\236?\\\303\233\351\325\014\306?\204\266\"n\024\316\336?\253\261\312\r`\207\356?\030\257^\367N\220\327?\334\177Y\212\005M\340?,\245\177o\314N\357?\023.MR\013\016\356?\336\013\036\237\322\035\352?\021\260\302qs;\351?\333\017\224\334\303\306\350?tZ\316zs\177\344?\230\3616\024\216M\351?\260IL\003\327\177\302?,@\363\344\036\253\303?c\320\n\243ce\357?\305\251B\363N\254\352?\362\260\272\000\265\207\332?\000\356i\376\2709}?\002\262k\2700\177\330?\340av\013)\025\304?\202\021d\376\207u\331?\256\246\220\233E\272\357?\3009K;\263\377\252?\336\234\264N\221t\357?\300`\203\372Ux\326?\030\276\216\370\037}\275?\200r\277\373[\364\307?\222\'\312\026\266\005\320?\252\341\361u\2102\333?0Si\003\347\233\333?n\224\356f+\306\355?\304c\001\232`8\320?0\251I\207\031<\342?\020W\362\301\237\001\251?x\377l\343\210t\341?\\gh\300\020\212\335?Pi\031\233\002\t\253?\0204\254N\266<\315?\000t\006\313H\307\334?\254%\212P\n\361\310?l\035*\243\262Q\345?\001\312\303v\032\325\345?\320\374\023\220\261\202\274?\233\210\277\323\310\345\351?\030]\204\210\302\257\333?1\336\304\266\366`\342?Pb\014BuZ\314?\214\021\367\300N\334\325?\204\364\177\221\316\334\317?\010^\204\177u]\330?\016\017\276\021f\360\336?F\363\344\320\217I\332?\272+`\337\210M\340?i\316:\017\274\232\354?f\334\354\230\320\340\347?l]\371L\303l\342?\210\332E\375\273\252\307?\312*\252\276\310\246\321?\357\302\376\222\336/\343?p[\323-\277\205\313?dG\2
63\021\325\230\320?`\000\326;\214\366\252?${\235WH\253\355?(~&\263%\377\310?\301\236\356\362+\246\342?\356\355\3625\"n\346?\007\216Ci<\304\350?\370\202\365N\022\223\331?\232\277D\001\262\303\347?\237\367K\351vg\350?P\351\363e\247,\340?T_v\337e\325\330?\020\303\216\343z\227\326?\307\211n\320\224\036\342?8\232\252\363\341\374\273?\355l\235@KP\345?\212\216V\204\r\341\344?T\311\271\373\354+\327?\320S\205G8\007\353?0\310\366\205\356\211\303?[\373\213\\\221k\355?$\274\237@p\300\343?\353\352O\250|\200\357?\325\025B\350\'\300\353?\350v\037\256\014\'\323?K\003\201>\027\375\353?\206\201u\254F\223\334?\320\256\22457\337\346?\246\231\210\013\315\356\327?}\371P\254S\355\350?\021\363\213T\273P\344?N\330+\260K\004\325?q\374\346P\377\224\347?\2745\324\353Kz\347?8\204F\302\033\205\316?\010*\032w\313T\352?\210\213+6\351\360\275?\334r\341\023q\014\321?\364\264\242\037\333\355\353?#\332\277\250Y\010\342?\370\340\224n1\364\353?Y\264\344y\0303\343?\330\031Mw\200\n\350?X\216c\024\244\273\340?\006\266\216G\337M\333?>`\340TgQ\327?\250hk\202\362\336\267?\\}v7\235\327\322?\230\347\366_E\345\325?0\373pl\356\005\257?{\334\317\253\242\262\352?\316\275\211\314\220\303\347?7`O\305\354\002\341?\310\377*k\306\375\317?\371\340\204K\263\333\340?@\250{\262[\177\210?\222Wk\010s\025\341?\370\220\371\232\013\326\273?\212c\032\350D3\321?1\253l\264\r\224\357?\232\\\303\245\330%\320?\1774\256\037l)\347?\272\2777\306\037\031\357?\334[h\257\377~\313?``\206Pg\010\243?1\354\346\346wU\340?L[@\272\323V\324?\004\371\031\246\314\304\307?\010(\311\355\0311\321?\020\324$\320\271\016\246?2\030\201a\262\241\351?\250\241\232\037T\031\317?0\373e\317\\)\244?\210F\231BH;\340? \315\301\333j\346\247? 
\302\246\354\307\306\355?\300\030Z\330RH\220?\r{\200\236?\220\343?2\370\034\276z*\353?\374U\032\323*Q\332?0m\375\206\214\242\323?&\215S\367\240\306\327?\007\030\324N9\350\354?@j\327U\302\247\316?\274\242\352\266\312\342\310?=\227,j\t\324\357?\tq\363V\313C\351?\025r\317\266\035b\344?Im\033\037\202w\340?\206y\330\322\205\275\344?r^\3545O\302\326?\327\203h\223\267y\350?\354\316z\226\351\263\355?\300\373(\002\234G\260?\027\352h\255\216u\355?\350\315$\204\244\"\310?\306Y\327(|\252\324?\330;\303r\244\204\343?|\335gw^\014\333?x\313\314\360\223\265\260?\000*\337\"\206v\223?Z\247@\0166T\330?\322q|\235\272\274\357?\246\360/\264{\004\350?J\'\202>\303\335\333?@\350\202_\233\025\250?\366<\273{\247\t\353?\237c\313\r\010r\351?\237`\373J\346\252\345?\275]\350\024\3605\353?\000\322\235\276\352\340g?\010\226\374\316Z}\357?8\236]\n\367Z\355?\253#\022\245dT\354?\331\275\263\224/\241\356?\330\306\311\220\027\215\330?\271\351\324\004\2278\346?M\254\221E\314\366\345?|\204\013\333bD\352?6I\357\360\301\331\337?\350Nl\277\312m\333? 
\335\307\335\n1\315?\374\341\301\247mC\307?\"K\014C\224(\324?\242\344v\002\261\356\330?\2744^\227\003\262\306?\370\022\t\304g\026\264?\014ZB\266\372R\327?7b\217$K;\353?pm\037~7[\273?\020\333I1\237(\260?\030o\312\036\t{\274?`%\267\303&\234\226?\032\241-\027\032%\351?\336\334\035\266S\273\342?\366\334\353h\006\213\350?\370\374Z\361\264\360\260?%m\207\317\277\017\352?\364\001L\236\026\352\316?\223\365\371\232\326b\347?.~T\213N\371\340?\220C\310\027\007,\251?\240\'X\\_\353\274?0$0\307\030\232\331?\370\0260\276\273\337\326?>\217\204z\247r\336?`J\216\330F\027\255?\340\353yd\rq\357?\364\343\365\233v\346\345?\000q\303w\266-\341?\360z\261*\3132\345?\252dX\2522\206\342?\331\333w\"h\013\356?\205\002%\036\373\374\346?\240\310\311\224{\035\353?\360\223\365\331\013\271\317?\250\315%\224l\013\262?A\274\003q\014X\354?\234\303fji\350\345?\np\266\272\3604\357?\340U\3601J}\230?u\262\224e5\251\353?^\305/\226n\211\336?\205\355\345e\252\321\341?@\340\235A\022\033\263?Ha|\3215\250\332?\001,\t\256}\177\357?%\372DB)V\350?8Q\274:\342\237\304?\000\364\246\031`\ra?\240\"i\typ\262?\340\207\207\340\377\237\274?\\K4\025\270\214\302?q\374\254\2112\370\342?\260:v^\372\327\330?`\r\347\203\351\203\340?Z\375\364@M\324\340?\"\376,\032\277\007\346?\333b\255_i\311\344?\226\\\346\332\242I\321?8\303QV5\361\263?[$Ri\322\320\355?\030m\001i,\217\300?$\037f\222.\004\306?\236%\302~\215]\351?\345\2615\031\271G\345?\272\355\361%\244\333\320?\370\325Z\235\256\310\337?\300\026i=i\243\240? 
\361Mu\"\250\322?\377\331\'\275o\255\341?\003\263\'\177\235\346\356?/\227\324\226\370>\355?\274\032\365\027\014\317\305?\000\236\353=\3543\266?l\323\351\230\301\232\343?\212\203o\0361u\320?\314\000\255\363\332y\307?\220\010\352\004\033\263\267?.\006\n\016\332M\343?\270\346@\360\342\276\310?Mg\230)\233\020\344?!\302\344\035\202\033\340?\214\257LTN\225\316?\340\324_yOd\262?\000\305Rb9!\313?\242~\035\2432v\337?\306>\245\021\305\230\352?\340\2007K_\025\316?j~!\0235\240\342?\\\321\031\361\320P\314?\242;g#\"\005\351?\332:Hv\357x\331?\263\225\3223U\267\340?\330u\005@\243\032\337?\202q\337\037\344\010\353?\320\026\273?\264$\306?`,\305\305\177\203\315?@\\\t\306\000\253\217?\347/\"\340O\255\346?\350\320\376\365\034\221\331?F}e=\321\300\355?\020\347\232\022]\316\265? \212\265JD\032\242?9R\254S G\355? @;\3337\004\347?\200\323\370\345\255\325\275?\314\277\r\267\257\007\352?\330\013f\344\214\001\322?\316\357\257\340\217\310\335?\"\304\014\216\246,\354?\24676\236\317\317\354?:\237\016\244y\214\336?\r9\362\210\276\200\340?\326\210F\341{\343\331?\212`:\013A\371\326?\035S\003\364\314\023\343?\340\010u\363\225\336\277?\306\234\356\362\275%\336?md\013}{U\342?\034\323\225\313,\241\352?\326R=v\\\263\341?G_\204R\272\223\350?\361,\320\257[\204\352?\376\013\241K\226\277\322?\270Nf\260\032\203\310?b\320MU=\324\350?\2730v\025\200\357\351?w\302\0173I4\352?\302\033>\343\365,\320?\232\245\311\242+v\321?\232\221\"|\243\246\340?W\274\200a\336\003\343?|\341\014Y\244\263\327?%\270\342HQm\354?\260L\353\372\304\354\246?\275\302\266H\332[\343?\242N\030Y\027o\354?\324 C\036\231K\355?\377\270QX\340\362\351?\316\336\370V\236\311\333?\300\n\334\034\263\236\223?j\273\266\336\341\316\330?8\200?\355*\353\350?\260\263\226jC\006\246?x\237\307\351Y\302\266?%\t\206Jp!\356?B\363dy\024\336\346?\032[\265\246>V\335?,\332]\270\0108\353?\004A\007\255T\320\331?\343>\343\221uv\356?\224\024%\367c\311\355?#\177 
A\261\004\355?D\244\214t\2651\311?\334\315x\354`\315\323?rj0,V\t\353?7\363\356\232\244\022\342?y\225\273\027+u\342?\240`\036\277p~\253?x\210\367\301;\231\337?\232v8\020\031\363\324?\350U\023a%\020\325?\347\014\353\345\232\262\347?\220N\342b\264p\347?4\213\226C_\020\345?9\0172\\\252D\347?\270e\277\264\320\371\267?\300\341\311\002;\277\263?\315\245\325j*\340\351?\260=\211\334\006\034\356?\200\320\356\272\205(\347?\230!\027_\253D\315?\303\256\033g]$\345?x\241\002GL\244\347?}\345\360\202\213\356\357?\250%1!\206\315\321?\353%a\3616^\340?\377+7\242\352\014\345?\274\253\006^T\202\315?\340\232\224\370\361\233\231?Hm\302\016%\301\321?]\224%}\335f\353?\310y\235}\205\r\332?\307\324\033\315\263\271\356?P\217\247\376&E\355?\252{+Z\255\254\333?@\033\265\367\371v\351?\2408S\276.\326\275?\310\270\262\205\345\027\340?\230\357y\016\036\311\305?\350\275S\253sz\273?\030\000\302\024\2207\272?\230\277#b{+\342??\242\335\342\004\004\354?6X|\3576\243\333?\374^\206p\365D\333?]\270J\206\020\263\352?l\037eW:N\330?\263\347~t\210\377\350?@?f+\213\236\224?\000\010\217\324ap\337?:p\025\253\016\301\340?$\343\"c\020\037\311?f\337\370:\025\377\354?\204\242\257\272\023\304\304?\010-\307\210\\\036\266?\000\245y\257\317\366\216?0\347\274\005\nj\303?\340\222\321|ds\354?\224[i\274$~\331?t\035N\276\032l\321?|\350a\037\314\365\347?_S\247\275\004\t\342?\376\034\354)#]\333?w\273\263\220\245\356\345?\225\201\341\003\314\310\356?~\215\316\320\325\201\342?d\314\351\003a\232\302?\361\034i\367\226\237\346?\346\033\017\337\262\"\354?r&\360\3143\312\345?\362\261\247\240\"\032\330?)\352RG\276\356\341?:\370\313\315\270\222\355?\025\260\325\336e\005\346?C\220\364\007S\024\346?o\001\024\236w1\344?6K\213\203\374\375\336?\020p.x\266\212\335?H\225\317\"\275(\263?\005\207B@7\250\341?\020\3531\362\347\007\325?r\313!\214K\216\337?k\257\353\257\025\257\347?\303\321/\352\263h\342? 
\272G\265\273\277\241?\024\323 \354\003I\344?\000\030,z\340(V?H\0026\250A`\266?\351X\300.\223V\354?se\214\245\272\257\343?\254\335\227\032?\010\354?\350%Z\025\270l\340?\030\355J\304[\005\314?\241G\301`k\224\347?F\035@i/\222\323?1f\346\225\002\267\357?I\301\234\270\010\351\343?\2610\317\037\306\251\340?\274u=\256\377|\334?\367\332\223\370\'\231\357?F\376\370\3146\242\327?P}\352\315\310\306\240?\344ci\223\351\363\320?\274\274,8D\312\326?d\253%\231\332\364\325?\264\263q\200\301\247\300?\235\221\334\221\351x\354?G\204\037\226\315\024\345?p\202\335n\331*\332?\010\351=\226<\240\306?\364\231>H\375g\300?8\340e\237yL\324?P\301\307\3735\025\240?\262\274\035xx\030\347?\360\037\342\3551\233\352?\000\340i\247\372\025\330?L8\002\350\207A\342?\330\223>I\357?\344-\006\367\024w\307?\270\374\232v\003\323\271?|V\211\013\231\351\310?\002d\333\313k8\326?b\366\226od\255\325?\272\3133\224;\230\322?\"\\\264K5\025\352?Y\235w\223\014i\346?b\256\302\236\341{\347?\004?\017m\\\303\354?\352\2257\2248\351\321?TB\203\363\0204\316?rc\376\220>W\344?+\304\357R2t\356? 
\030\237\272\246r\316?\356\211.\301\3354\343?L2\302DQ\306\344?*G\261\303)g\322?.a;a\204\212\325?\361Ii\376\227\235\350?\"\356\207\213\"\264\355?\242\276\337\243w:\332?\352\255\263\225\223I\343??\021]\tB\177\355?\314d\213Mf\205\304?C+s\262}\364\340?x;\024P\261\202\357?\026\334\332\000~\211\352?\374\"\204/\357\177\317?\304\306\036\357\002P\321?B\350`?\340.\332?&\0239\324\314\342\344?\004V\223pE\237\312?\374W\227\275\010\317\325?wf\324\243\373b\357?LX\364\360\363\225\342?]\343\0251\257W\346?\330\253T72\351\324?J\2558\315\205Q\341?\301I\265\223B\037\340?L\202\250\030\275\013\305?\\\220\320\300\323\274\317?\020\220yD\233\001\331?\2544\201b\223\334\302?\270wS\003\233h\356?\376\352.\025B\320\346??Fd\305\271\372\355?\365K\336jw&\341?@\231\342\344\310\001\311?\n\244\201o@\023\344?$>\327P7\024\334?\270$}I\277\375\300?\252\256\3004\234\325\344?\030\266\226\231h\215\260?\340\216\223h\361,\356?\002\201\025\022D\014\322?\352\\\240U\'\322\325?\260\367\245\325\223\244\261?\274\336\230\367}\362\355?\214\260\271\022Yl\357?\037\354\020\212\266\223\344?\344w\364\272\'\263\315?:\367\027\204&\227\336?48\325(O\275\322?\342\373\343yZf\324?ZPME\250{\342?\350\337\"\373\212x\270?\215\217\203\311\224v\347?h\350\24735M\350?\260\227u\025\254R\276?@:\236\r.\260\250?\234\370|\301\205\221\307?\302\314\206\322]\370\353?\206\367\262\241\262\010\334?\315\242E\324E\223\347?\000Z|8\026\237a?z{\213\030\005L\330?=N\330\307j2\354?V\023\t^\244\333\344?Pq*\356K\235\312?\021\216\245\214\214\314\340?B\035_\311\013\027\327?\200I\331\241\370u\205?\264\255\2100\347\020\342?\007\264.\036u\352\345?gV\310GVd\345?\331a\255d\026\214\346?\320\327\020\216R\346\327?\333gnG\242\374\352?@w\341A\224J\316?\000\025\365\365e\244\232?\262\\\014Z\321\021\335?B[K:\374u\346?\036^7\375\322\342?X\300\361\221&\246\305?x\206\030\377\366\315\312?\355\206\277\344\006\302\356?4#\360U\224C\352?n\317RE;\212\343?#\"4\372^\357\355?\214\207\t\345\241\r\317?q\032\004\314\367q\352?\374V\000w\367\364\333?\004\327\226\365\0062\310?B9{\201\2009\35
1?\001\005X\005;\034\353?\002\035\367\256u>\354?4/\237;\"\347\322?\013E\np\354\310\346?\344R\241\244\023*\353?\334B\314K\367\007\316?(\331\222\370\276\313\354?`01\332\220\211\305?.T\010\3570\345\322?x\366Y\344Wn\305?\224\340f\212\253[\340?\332\241B\333{\252\337?No\177\317\374K\326?\340\243\265\024?8\341?}\374*\3402\006\350?@\350o{\261\\\342?\244z\203\021\345\267\322?O\254\250\312\321\356\351?d\013\267\230;\262\347?(\317\205\300\026\237\264?B\347U\'\351\n\341?\004\342\217\312\177\242\332?\373\263\271\265~\370\344?&\'\024\252\331\242\345?\233\003\346\2528\342\340?@\346\324a\260\343\324?\300\326\321\236T`\236?\364Z\337^\361\242\353?\250\340\263\253\340}\335?\366\014\323\263{J\336?\300\241\313\261\036\304\333?\350\344\036~\371\241\326?bv\201\013\201~\330?2\276\202\t:\255\357?25\002\2755>\340?\n\\\3611\017!\342?\366\304\024\360\035\337\321?\254\022\301\007\233\304\344?\300\375\211=\246\223\345?\022\3735\330%M\341?\314\321\\aK\326\337? d\000\001!\243\234?\332\256\224\036\200\342\350?\374?\033\'p\241\344?(~T\330;@\271?X\"\373\272(\014\342?e\333o\232\233\371\357?\360w`@\216K\252?U\370\000C\277Q\354?[n?\332q\332\354?<\256\355\333\353S\310?@\233n\217\322P\250?\302[b\021v\271\337?\n\t\3115\024\315\342?B7\227\327\336\375\355?9\265X\214\000\034\352?\025\267v\313\302\214\352?\240\375\373\357\326\311\257?\035\\\327\273\245\316\354?\374\243D\261\235V\351?p\221\321\350z\203\246?\323;\377T\317$\352?\034AA6D\241\352?\376\355-\206N\317\342?\317g\\\256+\302\346?\225\037\335-\373\247\347?\245f.\254\324\213\346?\204\346\267\036+v\306?5+\216o\207\314?\024\343s|\353\272\342?\200g\325\232\326\227\337?\245v\334l[N\350?\335\256\215\3510\362\347?\315\311K\345\304\r\356?)0\343\351\244\212\347?\304\204\224T\005\262\343?\320\320\210!}\213\336?%T\210Qn\303\346?\355V\216\352\333\r\345?}\003\275r\255W\346?\360\270\273\277L\245\243?dS\241\304y\232\335?v\201\370-Qb\330?z\311\362\371\243h\321?\250\322\311t\212u\343?&Ga\241f\335\350?\375\322dM\230\243\356?6p\314@\006E\325?C:S\230Vx\350?\334\274\220\202g,
\332?P\347\207\214\tW\275?^\313\325W\233\002\334?\035\335x\235\365\265\351?\036\360_n\377\240\341?\240=\216%\250\021\332?6\026^\250\013\347\344?{\343\237\373\030\304\354?\352\016\331\277\343\366\341?\344e\010M\037\023\323?b\025\323w\254\204\325?\320\177O\250\217v\260?\244\3629{\177{\326?>=\313wX\367\324?J!\322\213\336L\320?,\t5\017\023\235\311?0\230!D\345!\305?L\2733o\213\250\340?\030\355\031\366\\\217\341?\031\342\205\223\177\204\343?\260\246\335\266\245\"\307?\261\377\326\336\252\202\357?\366\327\211\312\313e\325?\212\254N\320\200v\337?\374\3759\266\3029\305?\013u\342\017\262\342\346?fh\216\311\337\231\336?\234\215\211yNn\345?\010\2779\200\352a\355?\230\305:|)\307\334?\240\037\277\336s\020\273?\375\274\334\000+\337\354?\364%8`D\274\353?\260}\014\000\232{\270?L\236\224\027\002\223\346?0\000\312\177\267\301\267?\004+Uu\020\016\304?\202P:i\251\305\342?\036\232\212\006l\346\353?\020M\r\"\330\016\301?`\321\247\301\000P\240?@7\177\216T\271\326?\n\202Y{/J\325?\277\352\024\177\211\357\355?^~c<\366\215\327?\200\261b\305.\230\227?\310\357a\310=o\276?\2441md4v\316?\230k\343\365@\361\354? 
\021\261\232\337)\253?\236\005\025\372\314\332\333?\240)7\025\031\367\323?\366\330K\360\200\322\321?\276\3730\023\232t\353?\020\320\2324^\035\307?2\000\343\177\223\224\327?\210\271\337Q\t\314\343?\345\256\211\221\356s\356?c\237~\022\264\017\344?[\243\240P\305C\345?r(\005\245\305\247\353?x\251\351\322y\337\321?\340Z\237\373\366z\227?\000\206\014\242\236\375v?\252\303\373\307\264\215\355?\273\031\314\221\327\350\352?\332(qT\220{\327?v9\371\361\316\002\341?\327\244h=\305\006\341?\034\n\216\003\365\237\314?FC8\300\333Q\356?\310\357\227]\300\010\310?\300\006V\263/\255\341\356?h`YR1\177\354?\000?\035\225\332\000o?@X\3112\257\337\245?D\263\253:\"w\354?\020]\013\251\330\333\254?\320\007\274\252\275W\272?\342)6\237\330`\340?C\334\310\031\242\200\353?\'\305H\214\016/\355?B\0001\245\216^\357?\253o\273\324\t\223\342?\220r\307gR(\340?\247\275Ej\310\336\346?\324<}\356D\004\313?<\375P\231\341\272\305?B\257\227\023\360\327\335?\232\244\027\347P2\340?\274%\227d\325,\355?N\231\340\254\276t\346?hR\347\217-P\310? 
\247\r\311\367\232\316?\310<\352\300\310\037\300?\340V\232gKs\317?d\343A\007\316)\341?$\001_g\2758\354?\370\307;\016\243\n\317?\250\237\356\237\373g\336?|\247\307\373\234\335\313?v\342>>\252\275\340?3\355\035\335\313\004\350?,\221&\365\266\276\321?\000t\370\372\030$\223?\220\200\264K{\254\261?\244\322\306\210$\263\337?\312>\'\230\250\206\323?\3700\216{\207\031\332?\3577\353W\253\334\353?Z\217\333\003\327B\333?\240\246\260\272\225c\262?\325QY\204\"\352\340?$\321\331\352\223g\340?\300-#S\345C\220?\332\234\035kt\274\335?\374\257\004\333k\361\352?\021\030\217\257\351\332\341?\251\332\\n\374\211\352?\354\356Q24\213\347?\0211\215\202E\376\342?\300gu\276\220\037\203?\244mt.\331_\314?pkT\235\227e\273?\030o\344\276\233\316\337?F\200\245\017\t@\333?\243\232>\014\275\235\350?\326\004\2003q\240\340?\200\020\003[\242\361\337?~\263\366\322.\024\353?r\245\221*\206r\347?M\352Q\374\255\352\352?\375\222\2628\310p\344?\336\315\001V\305%\352?\3021O\033\261l\355?L\310@\037 p\327?\n\307\234n\236w\356?4\376\350\2601\002\317? 
(?\304\230J\244?\206\242\306\026\200m\357?\310\\\\,(\341\357?((u\262j\014\305?,\3546\014\353`\322?zlz\253\235\217\333?\314p \317\302\303\330?T3\354N\300\027\320?\266;\315y\3373\342?\2014\213\341<\222\343?\000d\346T\235Q\202?\205\277\002YNH\343?\220\n\350\262\355\306\306?Z\267y\254,\256\336?\340RDI\344\234\274?bE\2039\367x\352?\204\362Q\243x\266\303?\254\206\273\007\250+\343?^\277\027\302 \327\344?\302\023;\221\035\300\323?0\204e\354\322\355\264?\2676\314\007\300\263\347?\220^0\217\036\301\274?A\303\370\315\342\321\352?\010%\003\016>l\340?HC\025[\214\367\261?\206\352\301vQK\351?|K:y\242\363\324?\014\216\350q\335+\326?u\364v\300\213:\340?\032`P\357\272\313\341?@]\371\316\255\007\336?\361Q\014\217\0022\343?g$\225\r\363#\357?\2052\326b9\t\341?\310\t\367\202\257K\333?t?\304c\213\323\321?p\222\322\022\024\333\241?\207\276\302\304\024-\354?}\\\2237#\350\341?\316\223\314\206H\367\343?\177!\321\327\202\326\347?^,\306\322\215?\320?(\262\355\004#,\274?\264\030\373\310`\243\340?\347\307\\\267\004\274\345?\002\014\356\224\3002\343?\316\241\262r\\\240\341?\003G\207U\244\264\347?h3.\301\212\022\265?$\235#\304\003\373\340?\250\351\236?&\016\276?\374\317\230P.m\311?\336.\3577\251X\344?@\302|\3063S\335?\340\230\255@\247?\325?\331\327\265\336\r\241\350?\366\t\273\213\341\340\327?\272h\364\3758\251\351?J\211Ij.\216\341?\212\371@H\027,\324?\240vR\t1r\306?\252\025\312\305\031\364\353?l\262\372\305\277\344\323?\347\241Q\021\215\205\357?0\027\016\246\327[\323?\316Waf&\215\342?P\273\301\374\243\354\263?\026\207:\214\024\332\335?z\273\257\017\\\264\351?zZjhoo\325?\240t\346\030\315\232\327?\302\307cC\034\236\321?\n\277M\003\037\233\340?Jk\242b\334\271\331?\035\366\362VKx\355?\206\r\235\226\220\222\346?\224|\223\222\215\355\303?\272\347V7D\013\331?\270\334\323\251\321r\332?*\374/\363K0\346?\206F\017\344\310\000\343?\350\331\rJ\206r\337?\3661\203\234\203\214\351?\221N\322\220JW\347?\222C\215\256\220f\356?0,-\010D\303\253?\016\377\177\252\\\316\332?\232z\317\314\002/\330?\200\265{%\374\323\351?\
2346n\"^\370\333?\362\322\274DJ\303\357?\261\233\006\330\006\377\355?\206x\221\326\037\363\332?\342+\300+VF\324?\220?\255,D\250\276?\374\213\373IJ\025\323?P8G!\206\333\271?\220:)\332S\370\313?\344\300Du\371\025\303?H\377\356\376\013Y\323?0j\216\032\3737\340?\306\357z{\210\241\330?\260n#\376%\t\261?\340\205l\232x\274\266?\036}\237\244\314\305\334?/H;TU\322\343?B\371^=oM\344?\030\324\300\222\3069\263?\331\3626\310\233\314\353?\340\216\0370\237_\275?\220\323\2322\215\354\276?\230\305\271\224|\362\261?6Ft\310o\355\353?\330\233\315J\315\016\274?\312:\237|#\247\352?l\207\250&~\273\340?\262\224\335\326\275\003\356?\354>\321\251\343x\314?\204\350\001\226v-\314?\240\035\211\232\226\217\315?\346\230\014\245\364\342\350?\330\n\025\000\322\210\330?\257\262\010\255\275\375\344?\232e\203\270\353~\327?\271ju\370\345\302\340?&mqZt}\337?\362\223\315\262\351\325\337?\370\353f\036O:\302?@\262\357\244\006\251\342?V\307\'\373\036\251\321?\3432\260\332\231\232\345?\231\345\274\343\371\263\341?\024\370\320,\275~\307?64\353\177\251\"\331?\241\302\024\266A\030\356?-\010\342\032\023\004\346?z&n-\306r\340?n\350\000\267\007\313\334?\344\223?N\352\031\307?+\004\207\216*\350\344?\304nG\317L\320\303? 
1hq\024z\310?\310\203a\3013\211\353?\2654o\376\314\023\356?\223\031^}G\036\354?\227c\274\354\"\300\341?XnL\324L?\266?So\312\200\234\314\350?\000;{\271\2569\247?dl\020\025\247\367\355?9\340\236\222\326\301\347?\2345\246\261S\317\306?\360\346\260\034=o\336?\020k\303\n\343\000\355?ft=\217\3674\323?\340\362\204\025\023L\270?ae&\230\222\350\350?Hy\313\024\275\224\274?\271\337OR\245\357\356?\324\017\215\302\010\212\354?T\0058\226\261\\\344?\025e\354\251bf\352?\027\247|\2040\331\353?alTf\330\231\355?\274\355r\244\256\265\302?3\270\023a~V\351?\2303\316\335\376\257\270?\306\r\251\\f\213\355?\256\1778b<6\342?\376h\235-\307\333\343?8l\017#\025A\327?VjZ\377\033Y\345?\007\255J\201\333K\341?j\002^+6\257\325?\3368[\234\370\371\340?\264\261\300&\370h\346?\2201z\210\253l\304?\353\"\206\367\327V\352?`\016$\255\254b\313?l\205\201\016z\346\344?H\327\205\301\020*\304?\354\335D{Y\265\353?so\375mA:\343?\254\306\211\256\201\265\311?\310\013\347\343\347R\265?\240\215\201lkd\243?\274dN\211\342\351\347?\266\207\335\323\271}\355?t]\242\333_Q\351?\202\005\374\346\023e\324?\001\247\343\t\241\205\357?x*\206`\225\321\263?\024D\233\030y\211\340?t\237CUWC\322?\235\343LD\311h\355?\232\314\241\'\373\016\346?\310[T$0\n\346?9\356\3549\266\355\350?PF,\275\333\275\256?\326WJv\307\026\352?0y1o\320\317\330?xv\2106\334\301\346?\224\0031;>:\322?\004\230 rqc\300?T\320\316>\265j\347?\347\201\377B=d\356?\236m\200\237\272\202\357?&\200(\215\353\224\354?\342<\257u\031\271\347?2?\332\205\260]\323?\2605zC|\274\355?;e\307\003\305\351\345?<\023x\3518\r\336?\323+5\340\014\274\352?$\230\343\tm\362\326?\010\371\3169uz\337?\250 w\220\221I\347?:_3\005L\255\320?\335V\031S\276\225\344?$\266\370^\275\372\332?X\333\213Q\034\274\263?\220\356\343\345I\353\346?s\371\3348\330\217\343?K4\306\375\200\277\354?8D\273\224\276\326\351?\026W.\336C\311\325?\326\224p 
j=\343?st\214Q\342\303\354?\274\033.\343\373\203\354?\246\363&\005\210\203\346?\251\342w\275\203\304\344?\274\320\250\017\235\235\326?z!f\200\244\205\325?\022\250\177B\352\352\356?8\013\021\215O\222\262?\374[\232\236\014l\330?\340\3546@\021\177\307?\010\004\370\320\323\271\333?b\035n` \274\342?.?M\347u\324\356?\364\345\237\027\004\347\324?\"_\214\276\303\027\356?\\\215Y\375\266\n\332?\300\265;y:\323\276?1\216\"\327\255\026\345?\"\335\3132\215\271\355?b\016\252i\270(\345?!]\273\344\241\244\346?\331\215\276\2336\243\356?pQ\327\276\035\356\265?\320\244\306{-\312\312?|\265\334\360=c\305?\370\241\023\273o\233\345?\0205\257\234\267\211\342?\364G\213/\376\365\315?\200\247=xw\337\266?.l\352\341\\j\333?Y\310\261\267\254I\357?\302\221}\035\327y\345?\202d\035\276#\206\340?\260u8\021\220\357\344?\222\360\271\256\377\231\330?\206\317\311\236\260x\324?hs\035H8\232\304?\000\244\002/n\010\271?X\346\347\234\223^\265?d\035d\3648?\326?\000hm\300\342\263D?\014\262&\231\364W\313?\214G:\227\332y\324?l\234\311\020\331S\334?o\017XK\264Z\355?%\023\335v \335\354?\014\314\364ke\201\353?\222Y\246\333\202+\323?\214;\321\322S\307\347?\034\327\234~\374}\341?\340\363\254\344A\371\340?x,I\3616\251\304?\270\273--\273\364\354?\010Vcy\240v\276?+k\250d\332\367\342?\337\004\251\222-+\353?\200\306\001\263Kh\274?~\366\234E{\372\336?tO\336y3R\351?yu\351Z\016?\352?\320\274C\201{S\255?2\020\210\223\016K\355?S\003\354\256~\213\345?\000\321\324u\275,\350?\317\261Q\365\221\360\343?\221\246\014\334\345\272\351?\321\2500\203q\306\352?)8\031\371`\366\356?\350\023K\305]\031\271?h&B\227\033m\310?\020\2600 \273\256\265?\250\336\016\376\005!\352?v\230\224$\216Y\326?@\367Z\262\335\346\264?L\206e\364 #\345?\036\244\351W&\313\356?lY)\221,\356\311?\310G\346\313\262-\355?\250\201\375Q\030F\266? 
B\350P\2132\332?`f\2073\2411\253?d~MW\010\324\327?\310\375^\316\225\030\351?\331k\312-\332\210\355?\242~\035\037>\224\337?\301b\201\262\031\036\345?\n\332\2333\345\027\355?\360\014\236\225\023\332\242?\344\245\337\'#\'\317?\004\t\177\033\244\266\325?`\323z\353.W\245?\033\227d%wC\341?\372\t\023\367\222\252\330?0\277\022l\322[\267?\020\214\246\337[\250\312?\340\037b\204\333\275\233?\231\035\235g\213\230\351?\356\324L\211\006\232\343?P\332j\t)\251\354?Y;\212\374z!\351?\220ow\365NA\336?}7\271=yK\341?%\347\317!fM\342?\013>\333\217\213{\346?\204\372;[S\310\323?\356\240\003}\231O\354?L\201=\177\337\256\301?psU|\340E\275?\021\267\352\035\235C\351?\001\334\345\016\013W\343?\334\036{Ix~\310?\305?w\024\224\256\347?\344\355\034\213\271`\332?\324\343\335\005\310$\345?\234?\237\315\354?\322?\242U\333}Ta\354?Hy\310\222\000\336\266?\310\275\345^\355\252\262?\034KV\254%\321\306?\240\333\277\013Z\311\306?\260=\024fq\315\332?>\271(\257\303_\355?`\372&\322\377p\337?,\021\034j\\\266\330?\210\372\246\261K\334\314?B3\235ZO8\346?H/B\321c\252\332?\264\001\003&#$\300? \310]\203G\016\252?\311\353\350\023\260\210\355?\343\240\206\310? 
\344?\306{\275\215\224\025\331?*\251V\373\231\340\332?\361\271B\275\206#\353?z\240\333E\354L\322?\036QH\346\210\347\333?h\321\325fs&\353?\3560yif\275\350?\325\003\263!{}\356?\370w\225|T\231\326?\360\232&\r\014\307\303?C\361W\360\260\002\357?\020~t\210\204\010\241?\226\220\000\t\337\336\320?\003\204\264\375\233W\351?r\207\300M\223\246\345?\013\313\005Fr!\344?Js>B%\177\333?\337\214>M\'g\355?\370\\*E\313\267\351?T\t\242\200\313\306\350?Jh\275\370\365$\346?~\357fnyO\337?\236;\013X\211\007\320?\2729\363\307\347>\324?\340Is\301\027~\305?\360\245d\372t\256\247?\\\200C\t\244\020\313?^`\246\026i\276\320?\304\2064\020\373:\325?@\rF\335w\313\270?\274\255~8\330\211\333?>\313=\323\372\317\336?\024\246\357\353\311$\300?\262\024r?\n\t\343?P8r\250\221\305\273?^\037\301\240\377\252\334?\234 \336\306j\214\302?\020-\022\343a\350\261?\210P\251p\"\226\323?\210\346\222Z\025Y\344?H\\\220C8\214\276?P\360\314v\036\254\263?SUu\332/\271\342?\200\3441\003\2342\217?\306\211Dm\303w\342?\304#\221\331\221\257\326?\366\230\361\004}\375\353?\240\233\005\271\n>\321?\000Zl\240{\004j?\226?%O\030\225\332?|p\206ZF\272\333?P$\251~\376\206\334?\272\375\323\233\217\323\330?\263Car\250\273\354?\024\347\277\354\035>\316?H7z\226\020J\351?\324\306\324\245\\;\316?/\006)\272\343\307\344?\301\251md\364d\350?h9\361\0069\332\311?\333\260\350\306.\242\352?\006\203,\246\037Q\352?\326dos\222/\351?\315\245UTX,\345?\331!\271\227{\350\351?hR0\373\234\326\302?d\214G\037\275q\327?\352Mu\362\000\214\343?I\353\23354\"\350?\360\3602\3128\'\354?\264,\337\216\001j\341?\366\203\354Z7\275\325?\272\ne\034\226\221\345?\362\365|h\037E\326?V\024\256\201\356\374\354?\004C*I\236H\315?\030\030\234);2\271?`\343\271\007v\233\224?3\035.\367b1\357?\300N#$\'\266\272?\360&\357\001\036\324\353?^\353\177L\273\217\357?\232\250rf\246]\334?)\343\2542\340\342\342?\354\275$`\225`\317?>1\265\370E\201\354?l\334\246z\365M\354?\270,\207S\320\001\335?\234Nr\245\207@\331?\254\225Z\217\251+\341?>\037\246\227R\024\321?\360V\322\245\200\343\337?\331\203\207|\
367\037\351? \037\034:@w\310?0n\264\363\301\244\334?<\254\007|V\036\344?\364\266\332\254k\212\325?;4\001\211G\214\343?\342\361\360T\273I\343?/\367nP0U\353?\214\353\336\017\334\242\333?}\022)$\357\361\340?\237\274\323\206\212\224\343?-\030L\374\240J\344?\000\343\311\240\3176\331?\032w\305\255o\034\331?`A\007\311\323\001\271?\267C\314\241\235\336\351?\214\311B\177\375\234\336?\226\332\374\375\274\010\322?f\204\207\275\246\377\334?\202%\243\270\020\325\324?p\250\307\233\024\211\302?\222\345`\261\033\305\335?8\364\347\035\276e\337?\216\215\217j\371\257\356?6W\245\247\347\301\322?\210{\252s\025R\300?\366\347\234\330oO\347?\225\377)\216\377\215\340?J\037\265T\'\222\326?By\202\234\302\237\325?\200\221\247\323](\221?\231\277\357\342n{\351?$K\346-w\032\327?\006,wMm\376\357?`\214\202*? \242?l\207\357\'\356\331\324?\016\3304y%\213\343?\374[\336\362\331\377\324?\220\003\373|\330\345\260?[\334\315\315\311-\345?\335\263\360\024\351\267\346?(Xm\211\202\372\263?\362<\373\236\'\307\336?W\356*\350X`\354?4\007\332\216\005\360\302?8\324\375\277gI\315?\276M9v\362p\337?\222Z\377\217\030\340\323?\375\255\334\304\375\030\341?\220\223\211\375\034\322\271?\032P\311$\316\200\322?\316\3556\207\322\307\351?f\022\226:\226\247\336?P\360\257>t\347\333?d\n\234\004\347\376\306?Z\025\t+\300\036\344?{\352\353\3159\303\344?@j\352\210\236:\265?\343\212\230\213\003;\342?p\240\334wS\304\310?n\207\351C\nw\352?\030b\252%!\217\270?\232\025m\303;U\344?\033\234\217\327\251\005\357?\360G\346{\302H\352?\220*\t~-T\254?\256\334S\337\305\342\337?\2003E\177\212\036\266?\274\300\375\207yr\333?\334\3458\275\244^\354?.\035\215\232K\324\346?\334\035\217\341rE\325?\247\031\337D\200l\344?\240L\325\301\203U\356?\373\241\004\005\221\317\347?H\\\010\325n\034\302?O\334\037\374\024$\346?`\247\350\265\335\204\341?\354\026uI\037-\353?\016\356m\\-\030\337?\300\221v\217\376\244\245?\206\345\"\200\263\255\356?\3168 \235H\245\322?\346 
x\261u\236\324?\230\022\323\272\267k\313?/\241`\260cb\350?N\022}\004\021\203\343?\270\256k\020\272k\327?\364\243\343\267\026\303\316?\232L\22101\245\332?A\362\030\017 \247\342?p<\341i3B\246?t\301\332}\215(\300?\000.\304E2-\341?\260\3242*IG\252?hV7\257O\336\322?5S\363g\n7\346?\365\035\313\035\372\323\352?8\n?\355\250\314\264?\227\245wI\210\257\344?\\\376r\223\014\242\300?\207+\205\332}\311\355?\244\n\316<~h\343?\234\005C\365\346\003\302?xh\004\024\021\264\356?\232\221E\235\377\256\355?\355\037\324h\375\244\343?A\334{\243\177\327\347?mg\253\034\206[\354?\200\027\333\315X\"\270?\275\016\317*b\017\347?\3248\337-\004\027\331?X(\345I<\274\275?.\317\315\331\'3\324?c\030Jl\013a\353?\240\007\305\001\224\332\263?l\275\033\374\321\301\310?\200\301\025\031G\243\204?\251k\000\375\237O\357?@\276\3141U\311\232?\002\014\'\201c\261\346?\362Y(\361\374x\343?\200\3247\340\304d\252?i\37385\177\357\353?C\2734\374X\214\343?TA\301\357\246o\302?\010|\201\\(\253\261?\367\300\362\231>l\342?\265_\\2\022t\347?`\277\233\200x\347\321?\262)\265+\356\206\350?V#\351Y[p\347?\033\331N\317\232.\357?~ms\242$\000\325?\\\000\264\352\307\213\350?\230\240\222U\022\027\265?\256?\003L+\343\356?`oRn\303\216\357?K\203ax\033\376\353?R\370\233\230\217i\324?,R\352\203\311\366\306?\036\221 
e\362\242\357?\030\302\353\346ZK\327?\337R\302\265\027\246\354?\375uB}=\205\351?\260\027Y\235d\022\310?\235\'\342q\225\217\344?\326h\256\241\000\324\324?xJ\222\310!\322\330?\322\210\245\255\217\310\343?6M{\014\271\354\322?{!(#\377\325\354?\310\241\035\346?\202\301?\354\212{\340\005\333\323?Q\317\357\365\242?\354?\242s\307\306\213\306\347?\206iu\220F\313\333?P&\376s\014\310\351?p:o\263\215)\314?\252\372\3252\254\026\331?\206\325\362\236\345\253\330?p\341Y\244o\356\245?{\203\351\272\014a\340?DM\026\374v\007\300?\340\350\\k8\253\353?\323*A\026\312*\356?\220dT\202\363\336\243?\014)Z\t\037\\\344?\340\343\262\250g\254\315?\340O\024\224\337<\325?R\376A\241\343]\335?I*g\303\3177\353?\004\033\013n\210\n\330?\260#\364\331\264\364\320?\360\302\177\330\224H\247?\204\255\277w\007\345\300? \003\213\235?6\302?\201\350N\3237\352\350?\204\006\223T\206\263\316?\252B<\310{\367\350?\216P\215\236\361\316\357?k\221Gf\363}\353?\264B\016T\213-\353?\024u\331V\327n\312?\306\207\365\311\034Q\357?\306~\263\355\222\217\351?pQ\214<\245\023\337?\270\213*\244\364!\346?\255x\237\036\374\304\344?8\360OI\372\225\377\344?\370\226=\376)\265\357?\210\350\2738\364\345\262?d\023\302\225\233x\321?\207\310\205\317\201e\352?\325x\216`NY\343?/\355:\211+\026\341?\330\344\030\315\007/\333?\300\374\262\325Wc\256?\264\244a\322\031<\315?\276\370\342O\214\010\353?\326*\363\241\273\346\332?N>\220\332\023\374\323?\300\311\261\213o\025\246?\334D\0217O/\300?9\3151\317\207!\343?{\315\350A\3716\341?\200\331k\013\324\001\262?\334\267\322\031^\312\343?D\224Iu\013\331\326?\332i]!\324\332\347?\350\3671\371\353q\320?\014\224\353f\033\213\334?\027\350\001\367F\212\347?t\321\234\337\350\327\321?86\274=\3230\314?\302\373c2\374\305\337?\200\3313\265\223I\257?\334\3605q\211\226\353?5\333\334\201\204\350\341?\242\263i\232\275\235\345?\240\031\376\216\317y\357?\266\364k~-\253\327?6f\237\\\236%\335?\346\323)t 
\306\331?\306\333]\201J\365\346?t/\342c\227\001\317?\230\355\201+\351\202\261?\205`\244\255\351\\\354?\320\223\311\241e\312\276?/T\014\010ZB\346?\224G&w\301\206\313?\232I\324[2\305\344?\010@]2?\035\354?\3269<^ZO\344?m\351Bk\332\323\345?bY\232F\3658\350?\350\312\035\375\354\356\272?\240\335kR\211\304\313?\r\315s\241\275\342\350?\304\037\262{\357]\303?\3561{\203P\235\333?Z\261/\266B\326\346?\t\342\341\017\252\021\342?\0373h:%g\353?\035\337\335h>\207\340?8IqV\273i\307?\"\327:\207\n\340\353?\320\007E aA\314?<-jl\231h\340?\351\2406\204Oi\341?m\311!\021#\303\346?\310\205\002S\241\305\357?\304\022R\253Ae\307?pm\365u\245\307\263?\240\315o\330\306\323\315?\324\013\372\326I\234\351? gqx\336\210\263?\300g\253\377{\032\251?}\255\327i\342\361\346?\346\030T\266\000]\334?`\014\362F\303\225\276?\344X\274\32767\312?\370\262QRF\205\340?\210\234\361\242\360.\334?)\213\257\3510\213\343?D\000c\324\2757\302?g{\2406\244\313\354?\323\244\331\314\216\016\350?\2063T\257\317\034\353?*(#%=\364\335?IO\002Z\017x\340?\"\203A\331\nc\346?\225\374\322\370\374s\352?\230\032B]\250\272\300?\310\342\273X \267\313?\255\354]\025\216j\352?\220z=\030=r\347?y\366*\036h\360\341?\300\013\252\310\017X\212?Y\230\325\361\026\313\347?NP\020\250*(\326?\356\303)\273\203\200\346?\000\020\0000@\272F?\241\n\311\375\361\247\027\350?\312]0c\241\316\335?\2771{pw\342\344?>\220\333\330\266\256\342?\020\231\211~*\035\267?\364\365*\242eb\330?yH\245\016\007c\347?}\222\034\227\356\210\355? \356\360\021M\325\327?\200+K\212\024z\304?\324k\334\350%V\334?\224g\325\347\233=\301?\302\305\366\"J\020\331?{>u\276\272 \345?J1\300\006\346%\325?\360\330i\025\252d\340?\263\204\356\265\276\254\346?\200Z\372p\322\213q?\374\262*B\035!\310?(\344n,\311\226\330?8\305\"\352\261Q\262??\226\026\032fe\352?\370\254\247w\261\033\326?h\261z\236n\252\311?\257}a\260-\367\354?@\021\010}\270\277\336?\000\207@\375\207\254\244?D\275\250s\017x\325?\036\202\213\343\000\242\354?`Pi\t\201F\262? 
\236eA\032g\313?\t\036\367\377b\353\352?\260~\002y@&\347?\364\273\231Q\215@\330?z\347\260)nd\327?\367\257\222\373\263\231\355?\233$\224\225\311\341\352? \264\232Ji\323\342?\330\215m![\246\337?\235\2219\377\350\245\345?c\212\313\324\360\317\342?\2402\316\231\367o\234?X,\224\222+\313\275? w\367\2201\022\306?\252\030\023\200\244\340\322?`\'4\315[\305\354?\220\227\372@\255\250\356?4\234\345\2104\t\321?\330i\273Nrz\306?@\240\361C\225.\243?V\314\251\274\275?\350?\334y\372S\005\254\344?\010\306R;\237\355\311?\370\271\032\254\263g\307?D\200\021\256\3619\325?I\344q\253\037\030\342?B\350\211\226sc\355?*\261\rD\276y\335?czp\311\371p\352?D\331\265\271\034\374\351?}Y\367\333\303r\344?+\036\2365D\307\352? \2025\022\'\347\250?\276\010\3469\205S\325?\000g\nq(>\210?\332\343\205\310\375\262\325?jE\273\367\006\353\345?\320<~Z\030\337\313?\016u\022\341\377\274\325?\323\024q~\261\347\355?\231\2169\204fi\343?t\234\210\215\n\320\322?\207\357\236\317\223C\351?\346\305\302H\0279\341?L\204\343\357\254\310\315?ps\323\006\271\013\313?\220\325\033\362\022P\332?]n\273\004\262\357\351?\376*\357\221\334;\331?\001z\004\340j\002\352?1\254e\200\014B\357?z\032\214X\371\357\333?h\'\307\013\003\341\263??\023?\017\361v\353?\371\332\'m\377\304\343?\232\333\251}A\010\345?\344g\261\233\210\266\304?\3444\262\205\210b\326?J\327\370\347xa\356?\320u\256\3738a\344?\n\270\2333\212\234\346?\312sj\336\310\211\320?\362\236\3148*\030\350?FsEV\014=\357?\300\320\332K\253\356\211?J\2723L\'\317\324?\350\225T?\006j\350?\tr)\236z\006\352?\3206\021\302o\366\244?4~ft\314\017\336?\240\200#\344\320(\343?k\254\365\271\253\230\342?\201\204\252K\007\247\343?\300\314\313\201\213[\270?.\242\256%6\353\344?\035\267R2X\\\357?\023+B\361\256\320\343?8R\232\010U\345\352?+\';\260{\362\350?\266E\375\t\3656\342?\020\227S\337;M\345? 
\2569\201\2434\310?D\276\217\013\202&\344?\000x>\356\261\243\242?\027\344U8E\312\352?4\022)N\243\004\300?\226\371\321\320\245\234\345?\256+w\002CO\336?\336YJ\275La\332?d\371\245\317@!\316?X\225,\225\323\237\326?\267\330\212>\023+\355?@\253\211\253\370\275\251?\032\357\200}&U\321?\372d\304L\261\200\332?I\032\244\212R\370\343?\3067\330\360\210k\342?\000(\203\0068/\305?\275\221J\212\376j\356?(d\236b\337h\357?\200\347\342\242f\330q?\340\344\002*v\275\241?\340\361(\366%\306\323?\200\271\006\247\nI\204?R\241\204\246\rg\325?\244\263\2465\367s\320?\330K\270\035\250\305\301?jc\233\003\205/\333?GwS4R\n\350?\337\300}M6q\342?:p\225)L5\350?`>\337\340\261\010\253?\360\343\304?\221\357\326?\334\035R:\027C\347?N\320\370\316\351T\326?\266c\217|)\263\323?J\217\214\213\366\231\357?\n]\2467\312\024\321?\276^\345\320x\274\330?\366\342O4\313\031\353?\240\310\272\204\330{\230?0k\375\364}\375\266?\344.\000\032\267Y\340?T\346\004K\363\221\351?:\341\245\347\255\035\331?Q]\250\355 \007\345?i\342\225\376\331\356\356?\320\267\221j\321\"\275?\007\324\313\343\212A\352?\360\2668\037\245=\311?p\373\216!D\337\333?\336\257 
\252\355\201\335?\200\346\014h\t\353\341?\305yS\301.\231\351?T\350\241\372\177\365\331?\020\032\037nq\224\253?`\242\020\\\037o\341?\213\177P\277\205\240\341?\203l\345\n\\\007\352?\320\320h\3028\r\243?\331\276K\033c\344\341?\207+a\265\305\221\347?\234\204\373}\234B\327?\214YW\300T\001\356?\240\327\010\306g~\304?\317\332\307\310\254p\351?\241\302\200-~\020\350?\3406\331\376\2251\353?R\"\236\241H\355\344?\001\373\000[\357[\342?o\025\344\026\261\340\342?v\035\345*0.\355?QGm\355\310z\351?\032\351\200/\017\251\344?\202\004\005&v\002\350?\374.OZ\324\301\353?\027yc\214j\231\352?\254\225\245\261\034\233\311?\202\353\032n*T\354?\">\037\363\267\272\351?h9\251Ye\366\336?`\2656>\004\330\357?\200gX\224u7\245?\\1F\034^\364\312?\310\010\352\356\255\237\355?\300\266\002\005\003\213\240?\000F\204\366\274\"\354?\370A\365\265\351S\341?\336]\210\370$\241\323?\\\t\350=\366u\312?\n\221\271\367\365\350\324?\311\242\341#\253,\357?.\207^Ee\246\351?\314\000\342\274\002c\317?\374\335\2737\021\352\356?\004v%\211bK\351?3\016\325\346\354-\345?\024!r\353\000w\344?MN\325\314\331-\352?\307\212~\022\375\251\356?8\317-\375\037f\327?fw~\274\003\177\343?\362\\y\336\024!\325?\204JJ\356\350v\355?\277\315\344\372_`\351?\326yOw\027Z\330?\036\3422\344\340\326\331?\034\2105\003\255\364\342?\260\n\326\3422\354\332?zN\277\230\320\300\330?\024\207P:\260\201\341?8gt\362\233~\351?\230\371p\304-e\310?<\343g\330\366S\346?\255<\"\240\177\341\345?\232>$zf\016\352?\264\270G\201t\221\352?x\034 
\272\374\006\322?\335\377(\326\352\306\355?\274\r\346;;j\306?\264R\210\360\000S\321?\202c\343\361\261\t\347?\001\323+\336\014\332\345?\330\371\271s\351)\273?\324\351\325\242\230\243\313?\371\310,\'\361]\350?\270[,\306\326\360\330?\250\336\\\335F\000\265?\030Q\354-ib\332?\2003B\301V\214\310?\340\r\177\361\036U\226?\000\203\351\317\360\321\214?\364%|\323O\337\336?\315wJGf\360\347?\316\363\234k1[\335?\307W\006\265\234[\340?\342\340n\256\235~\324?F\320>\330\264\"\332?\236\262C\316\351t\351?g\036\223\376pc\355?\345\304\350\310\016\245\351?\330+\232H\350\033\356?~=l\n\364Y\357?\014#0\'\305\022\331?\211\305>\263\033/\345?vp\272\256.\263\336?1\364\034\233\274\034\354?@+msk\265\275?\370F\365F{\372\312?Ll\212\375q:\320?Y3H\313\352\301\352?$~\365*\340\021\350?\276\330\025p\231v\343?|\013\2411\373\216\331?\004\220I`\221$\322?\030\0179i\003$\347?\330\257#\211\\M\326?\227<\211\013H\014\354?d~K\013`\240\323?\004\002\022K\214\001\311?\300\246\377\202\353g\337?\316\267\331\224*\276\324?0\323\t\374n\310\245?\217\267\304pt#\351?\022`\356_`\364\342?\030\307ai\324\353\331?\276\223\027\325[\010\323?t\214\001\253n\353\337?\235pIk\267d\341?+\037\'\307\274\250\347?|\'\325\257(\363\352?\243>\332\302\023g\345?;,\034N\362\232\354?LKX\240WR\312?XA\353\036\025\310\331?^\345\270\216u\022\350?\010k\352\231\234K\274?RI\346\310\030\225\337?\230\261HP\201\261\312?\240K\346\332(\020\230?\377\205\275\271\000l\356?\220\210\261\004\207\232\245?\3700\255\035\226\361\300?\2561_\376]R\331?\255y\253\267\317E\354?\374\234l\226t\233\325?+\241\331b\211\022\351?6\177\315\235\277x\344?\224\257\206=R\214\340?\271\342?\332\216\'\350?\014\302R[)\371\307?\304\256\204\332E=\300?x\344\025\354a\344\273?\217\377y\357L\260\353?\365\034\275@\220#\356?\234\334`,\301\301\302?\032\344\371\243\274\215\342?Z.>\334\243\203\337?\322\025\014\347\324x\351?eHpt*\022\353?\304\335\266\251\3759\317?\365\225v){\266\340?a\002\027D&&\350?\312\020~\350\241r\320?\250lLa\035\201\315?\304\257\3347T\205\312?\274\343\231.\035\200\307?\343\326\3
42\360 \351\351?\220\002\351\226\204I\275?\316\346\263\304\304\302\351?\203y+CA\251\341?\210\243\304;\270\234\304?R\337\275\'\032j\352?x\351@\027|\006\261?z\371g\270\243\n\342?t\357\303\230\353\037\321? N\243\017\223\270\313?E\267\335\225v\000\346?dF\365L^\264\341?nU\214\021>2\320?B;\375\330\317\001\341?O\225V\266-\236\353?\373K-\336\374\316\346?b\225\355\352\010K\323?p@R\201d\255\356?\" \233]\272\273\346?\320uF\3135\213\350?\327\235\354(>V\340?5\277(\356\026\t\345?\nr\234#\332\261\347?\003\230l\013\242\327\353?X\017!\337)\376\313?^\267r\245\356\313\357?\200\266224\026\317?Fpf\247\215\006\350?\200\360\203\255S;\246?;X\241,(\205\355?\335\031\261\274\032x\353?\366!\265@\325\245\353?*\253\362\314\2141\327?\340N\263l>x\307?\020(p\300\303\317\355?\264\371\261\245\312A\347?,\201\270\203!C\326?\32786i#B\357?\226\324\377\013d\034\321?\241(\251\235q\260\347?\326\027p\274\354\257\351?d\373[{\007\301\357?\354\204\n\216\\\202\336?\256[\r\365\314\373\327?\272\024\202\244Um\346?\262\177On\022s\341?\250\363u%\215\325\344?\310 /\325*\231\351?\242\327\355\232\354\203\322?\334\3231@\244\"\312?*\376$Y)\212\337?Td\364\000O\210\350?\224_\323\241\247$\322?\205\264\350T\204u\342?\200\201\256\252\267\212\273?\221\314K\270\267\276\344?\202jYM\316H\330?v\342l\372~\241\334?N\235\317\264 
\342\355?\364_\261\312K}\313?\221\345\014\356\242F\351?\254}\335;Og\333?p{\337\373\365X\312?(\2228\013\206\236\336?\370\277\212\345\201\244\336?\374o\224F\304&\313?\214\375\3700\347\222\316?\000+x\3736\237t?H#\237E\021)\306?\323\335\376\323\200\017\357?)\255Q\232.M\350?f\007\200\267\257\037\343?\343\035\210\311a:\356?\324\356G\241\037M\345?\026\002v\274\363\325\331?0\272\346N8\327\246?\233F\320\356\374\311\341?`zo\266Z\364\310?\023=\215\317\'\323\354?\320\330\372\270\2272\350?x\204v\262\213b\320?\326Z\277\226bj\340?\000\274\343\036\207^\243?E\340\241\034\'\335\356?ZFuH\326(\327?\221\304\244\2412\326\346?\360\227\214\210Kg\355?\200\'#\324\266(\257?\214\366\252\267\227\264\304?k!\333|J\323\347?zs4\324-\334\354?\000i\315\001\245\020\336?\307\362\360V\036\334\351?\363\264\250\324\320\367\344?\246\315\033\370q\357\356?.\234\212\345\033r\357?\274\340\003\374\035\276\325?a[\314\332\361\211\353?>4\010\346\334\022\333?F18\333\274\332\350?h7^{\177R\357?\240:\030\220h\306\335?d\341\000fy\'\300?\311\235\304\014\325e\342?r\215\010\031^\n\337?\000h*\2463\3256?R\377\212)\350\010\352?\262WLSY\326\322?r\316`|\247\370\321?j\233\250+}\231\346?#71\035E\216\357?\300C\t\260\004\340\307?\370\372\351\375\035\256\327?\215\235Z\035\004\t\354?\224\020\246\321\253\353\340?\034.K\003\341\372\324?\206\231V\327\375\330\324?^\227\212\315@Q\322?\344\310\322A\032\353\331?=\3426\267!X\357?\346\342\t\263f\307\344?\340),\276\211\322\274?I<\202\306\325\372\345?8\3332?>E\303?X^p.\005\304\266?\030.\370\347\026\241\333?\nxU;\2755\330?\276\373\252\212P$\323?vR\257\221\301\021\341?\002g\3451z\341\355?*R\236&\0169\322?wv\201\350\\\224\345?v;\213\257k\345\326?b\340\363j]\234\337?\300\3428\270]\000\312?\332j-S`n\325?\200\036\367\365;/\311?\206\253\006\026\377\356\323?\231\013\201\244\026A\342?.\356\235\327D\215\321?V\367\0231\327\307\340?P\345\251\014pq\316?H<\324f\341\201\305?\272\355j\204\272\347\333?\245\331(\037\211\022\352?_\261\265v\241\311\344?\374\366\326)\037\255\351?\214\n\266VI\321\355?v\334\022{V6\3
53?k&VI2^\344?\'\006\306xm/\356?\\\3206\353\201\300\330?\334\314\020\035>\314\352?\310\200\364\375\245V\343?H\206\217{\006N\322?\256\330\261\326^\245\322?X\3716\227\226\333\274?\263\">\241W\307\346?\264ee\257jV\312?v\214\002\275A\366\341?\'#\327{\004\370\356?\220\203\003\331\206z\333?\364\010\007\204S\213\326? \220\004\224\026\366\330?\202\rG\"\300\373\342?\314\371\362\310&/\344?Qt\333*\244\311\343?\305\307\224F\227.\341?\000,!\007\035W\276?\310\373\3301\335\232\321?\000\017\247\025k\035\276?\004\374\2648\013\320\317?x\373qd\304\204\334?\006\004Fz\003\233\323?\300\230v \020\317\311?*\033\323\333\022\207\323?\273\332z\014,7\352?\n\330\274\211\t\256\344?\257\217\323S;\333?(\224\034/\264=\343?\035}Ln)Z\354?\234\'\233\013\013\242\357?\024<\246\220#\273\336?\230\327\013\031$\277\357?%\315\225\256\362\376\356?\377S\036\245\306\224\355?\3729<4\323L\357?(\214y\347\261\313\305?\257\037\246\203\'\334\350?\247\355hD\3339\344?\245+\2174\375C\347?\300c\375\010\201\264\275?H\022\243\014B\274\260?\363\007\024\207~\306\344?\335d9RjH\341?\257[h\267\202_\356?\227\343\315\260\367\264\353?\370\\}\265\204|\271?\341\226\356UDY\356?`\306\204\231]\237\225?\262\364,\346(!\353?P\373k\354^-\242?\253s\007\034t\202\347?d*\276\363o\234\330?\034|\312sd\270\310?\352\306\362k\003\307\344?\363^B:\004\277\345?+\241\013\021\027n\340?\034);\347\356k\335?.b*\307-}\341?Y\026a\361\037\317\344?H\220\t\356\366D\273?\344\024Y\3570\351\344?\022x\2035\261k\341?\224\340]\325?dW\203\263\337\007\346?c\324\221xmM\343?S\027\253Sn\"\356?\030\315n\375\226x\274?\323\302\272\325\223\324\355?,\261:\216\245M\352?.\247\024\362\277\363\335?dSKj\030\226\345?\264@\250\356kX\343?\330\353\233r\203I\265?\315\223\005\306q\\\352?\357\212\205\2264\331\353?\247\206e\263\024h\351?\206W\212j5\327\341?|\034\023\356\361\243\345?\002\356\310\377tH\357?\323\255 X\343\353\352?\300rQv3l\234?\351\247PP>?\342? 
\031\323wq\r\264?\000\253M\347\010Wr?0*\355\022\262b\356?\210\020\271\263\357\222\312?aq@f\214.\344?\316\336>\032\225G\324?\000\307\232\273\277\352\212?\261h\266\330\275\361\354?@ky\313F\344\321?x\235X\330x\244\322?\215\036M\223\340\370\345?V\027\022\305\252D\340?\260\341+\325\217\232\310?\264\377\270\2716\310\344?\275\301Z\256W\317\353?\340.\256\014\311N\241?T\001me\331\236\355?B\366a,\237\202\347?hk\321\336i\350\304?\000qz\237\333\007\327?\361\310\341\014N\211\342?V\335\245\000\361I\353?\0107@w\341%\335?\005{\373\221X\371\351?\263\204\034\324\346\177\345?j\231U\316\254h\347?\201,\033\013\324\"\347?t\036\016\nL(\336?\240\247E\202\237e\246?\320`a-H\270\312?Pb\367\225\260\365\341?\224K\2421\370\003\356?\214\344I\307\305\333\356?\366z\\\254\311\220\342?\014\r\241\\\347\272\334?\320\225\037T\003\236\340?\276\320c\350d\254\327?p\343\001\335\000/\242?\264\3777\177\350\312\321?\002\000\030[\254\212\347?\270\\\305.\2035\323?\250\251\366:w\370\304?\236\317\240L\326]\350?\204j\201\202\254\004\305?\250k\357\326L\364\306?p=\236n\024\311\333?\315\177\026.\345\341\344?7\237\312\345\227e\353?\n\325\315\353\177\227\331?\260\275\213\367\316[\274?&Q%\373\245#\342?\230m\t%\266w\261?\356\243Z\324\322\027\350?\221\253\340]\237Z\354?\356\332w\304Y\020\346?b\021\016\262\363\027\343?\340\014\352\241C\362\267?0\356hcP\313\326?\230\230\270\317\260\334\347?\323\304GGr2\345?\027d\253\023\016\246\353?`V\364S\007f\322?\335\005\320\342\203\232\344?\210\251\\\213\314:\337?\327\272\340U\005\241\345?\340\210J`\273\307\314?\010@s\200sh\270?\200\352\016\372\345\306\263?\246\3605\235\003\253\325?\000s\355\226\370\352\203?\275\024\032x\220\352\344?\310\251\347\013[\216\325?\300x]W\377\312\336?\235K\261\316S\375\342?\020\205f\254\320\026\324?\304\rZ\255\356\360\340?{\252\255&\030@\343?^\327\376\370\036\204\355?2\215hV\234\332\353?b\301\257y\313`\353?n3!\343\253\277\321?4\005\301\267d\216\322?Y\216s\371\027z\355?.\223\363\244\355\202\354?\241\235J\260\263:\354?\000\364Py\342\006n?\346\257\214\t?\304\341?
\342\331~\1778\214\350?\236\241I\nY\252\334?{\001\2063Q:\355?\020|A7\031\231\333?%\277|Vdz\341?\332\270\354\202p\035\326?EXj\213\211p\350?\207\350\263n\322\230\354?\351\373[X\016\217\356?\312ux\311\366\343\342?\240j\376dk\256\220?X\364\033L\361%\310?\216\244\275\215\225\364\354?H\217\210\020\235\022\315?\340}\271\256\252\374\267?@\263\252\373\322d\202?\026\351\021>\324_\326?\026\374\204\217\301\357\321?\272#\277\2261\002\333?\260\301!\350\276Z\352?A\334u\363\004\241\351?\277\347\226(]\361\346?\212\273\375\225\004\274\330? \231\232\230e\024\354?H\205@\301\302\303\275?\341\2608C\0250\341?\257\304\3133\335\215\342?\024\373M\221\001\217\334?_\211M\373s\255\357?00\272\202\334\224\322?(*\005A\363b\276?\032\273\224\336\374:\357?\254E}\251\245Z\321?\302\222\270\336\325\351\355?^\242\367\260eC\325?\244v\335\nU\007\334?\334\272\231\272G7\332?\252\3534\\\2404\351?\177\317\312\023\023\251\350?P\236\0376\311\365\314?X\001\343\014\233L\261? \363\235\270~\014\272?F{D\265\241\321\350?\270 \236:\373\017\324?r\330\360\227\264,\346?\274\270\242\365\240\305\343?2\027P\3343\001\326?\220\301\345*\006\003\316?\2321M\270^y\341?\001\324r\273h]\342?)\237\206]\326\361\353?b\217\3064-\352\332?P\023\007\2471\213\341?\367\247s\352\n\222\342?\366M\204\3520\250\353?\350\364\233\001\210\317\274?\200\342O\356\001\001\225??\032\346\007\226#\352?\357\251\313\313\343\225\343?#5\177\2418&\340?\205\002a\326\214\236\350?\320m\325\355\351F\313?\020&8\036G\317\277?\256\275V\177\332\271\337?\341\362!\006v\261\357?\300M\331\201^\353\254?\373\002#\331\240\332\340?\300,\261%\335\251\331?\300\031i6\346\324\324?\226\335\\\020tn\353?\354$\253\'\256,\352?t\'\340\210\374\247\350?\310\034\335\253\337\265\313?\rv\201\035\356\305\340?\253\274s\304\323\232\353?^\210]\244\262\261\333?\010ua\321\221o\313? 
&\200\322\244:\273?\320\363\013\240\303\201\257?TN\366\020\224\351\305?\214`\256\027\332\034\340?\020|\356\033\"\277\305?\014wG\311\323\314\346?\026\227\025\360\316b\324?>\216\360\277\006\236\325?\264\376\237\221\253\337\300?0\3637\253\353\301\323?\304\261aFWj\306?\320\'\355%\004c\276?\200Q-\025\330\r\223?\374\236or\207\260\306?\270B<3\315\027\266?D\227\344\343\261Y\342?\352.\365\t5\331\331?\264W/\271Z\t\320?\306\326\355\025\360:\341?\256\312dE\270\017\347?\020V\'\035\230\035\303?\000Am\356p\006\201?\210\304}H\023\267\311?\300\372r^t`\211?\231\363\215\307\302\213\353?\346\343Z\034pm\326?\3006\260\027\204S\355?\223\006\313\237~\330\341?D\265\263\333E\001\357?lY\007?\002x\327?(r\236\247\013y\351?D\346_ho\036\310?\215]qMFY\351?t\330\303\r3g\317?\320\343m\250\243\255\276?\271\\7g!\014\340?>\355\207J\030!\347?@\214\331>I}\256?\370\037q\3559j\333?\236\244\240\\\302\234\341?\227d\304Q\316y\347?\375\235\215\362Y\033\340?\301\254\322/\231\274\347?\030U\354jk\275\341?\005=\343jCa\355?\020\353%\331\300\024\243?\253\231\345\007]\257\343?\007n\220B\273\347?d\014=W\231\342\350?<\301\\\340\311\320\320?\334\231\247\240\344;\320?\240L\361\245`u\341?\367u\n\317\370\263\346?\3453\271\035S}\351?\014\375i\t\013\022\356?\372\300-\245Dg\335?\326\343\307Z\344\217\330?\002X\254\362\240\261\344?\304\337\257p\022Y\334?l 
\023\216\340\373\336?\340\025\305\265\345y\356?\034#\252\324\017\247\352?\264\323\262{G\215\307?pv\0359\001\315\306?\225\3428z\204c\355?^\341\343\017S(\344?\261\343\027\262\327\236\342?3\t\322%:+\344?d\035\232\232rD\347?\034\366\021,dY\315?%\356\244\340\013\340?8v\000JQ\231\305?66\326:\035\276\353?_\246\207\253\377\371\347?\216\352\213\275\216\247\346?Y%N\326\305\332\343?\206/\362W\'\370\327?\272\024\"w\302\035\350?\035\345UF:\256\355?\330\033\230t\254X\356?0K\365\007\"\206\322?\252\"e&\353a\331?QZ/\2703\273\350?\236\300\014\313\016\234\346?\244\001\332\341\023\346\321?\366\016\014\204\364\230\331?t\014\216\207m\320\324?aOD\0273\354\353?\006N*]\303\237\336?\002W\014L|r\323?\036)\204M\345\020\336?`\\\263\213M7\260?\340f\352U\273k\224?\216\235AA\013\"\332?\335\354\037\013\0349\343?0\253\000Wp3\264?\000\260\335~\037\306\257?\214\370\305O\200g\316?\2404\351\022d)\267?\332\326a\006Ox\322?\224;\025\263\267\316\323?\316\276\343`b\342\341?\220\273\032Lx\273\330?\030\236u\362\300\261\327?&\233 \266\375A\341?/\274\374\206>\337\356?\321\020\353\366\2550\350?\301\322 
\253\037\200\353?|\270\203\332\341\210\353?\266\310\264\247\026p\330?\\h4w\315l\302?\262m\211$\020\300\342?E\303\227\356?\360l@\202\212\\\353?\375\257e\264X.\355?e\2052\3655X\351?R\251\224t\335M\332?\210\336\013\022\022z\356?Y9`\372\001\230\345?h\364\367\217\004\362\311?\354\330\216\353m\276\327?\334s\274\367\300p\313?y\272\376\231R+\356?\324N\376W\224\255\315?\000(oT\323\315v?\032\023\377\016\006=\354?\220\264\224\327\352\255\262?\357\244\212\226\313\372\353?\3170\306\350,h\353?&\267\242\215\327\357\357?L\315\354\352F\300\356?4\271\325\377\014S\313?@\233\252=G\200\324?\212w-\345\021\377\342?\200\236\350K\360Bv?1({\347\253C\351?\225/\354\226\202\325\346?:H[\0310z\346?\036@\"\\\322\301\350?\202/\266\256\272\255\325?\372(\337km\220\335?\026\253\314\225\321\341\320?8\n\310\252\244\302\330?\255\035\330\223\371g\342?f\357BkT\266\322?\240\224\233\304}\350\244?\340\273!\375\035@\255?\256\356\300Ny\035\321?W\366c$\357\270\350?6I#)\354\214\352?\312\327\007\272\325\315\323?\345\'\250\351\216\254\342?Z\021S@\265\346\325?\344@\316S\034&\344?\317\200\266\210mG\353?\020\202\300\337u3\320?\210\020\360s\215>\324?+\277I\332Ig\354?\2534\301\2733\300\342?\334G\177A\347\301\330?\247o\371\272MY\341?\250\021\0364p\265\353?7tS\331\2523\346?\nj\214\007\332\257\346?\211y\364\026f\022\350?\221\220\234\"h\202\341?\320\242\372\0018\274\340?R\351Ch\245\242\354?\324\203E\344\331\373\326?0\217\201n\346\357\275?L\257\247)\224\310\303?\000\027\026\345\276\003g?L\017\014\2421*\341?\320\362\363^~f\240?\377p\3656\2163\345?\360\025\361\270A\026\332?:\027\313\334\0162\356?>\316\302os\036\326?3VZd\307\345\346?\236\244\nP\212P\326?\320(\000\226&m\332?\254$\203+\311>\305?|wC.fV\317?\200\332\250>k8\245?\\e\025G\003\212\315? 
$\021$9\356\355?\3644*X\255P\337?tP#\217\036`\357?\236td\360\351?\320B\352\350\237\340\273?\266\354\202\336\307[\344?\200\363|\205\350\331\233?v\231U[Z\001\347?\320C\340kf#\240?\264\235v6r^\311?\314{\301\013\363\350\302?\320\222h\200\231\257\320?=\000\351k8f\346?\212\227s}r\323\346?\354k\202\027\225_\342?@s\327(\031\010\341?C@\020a\256\010\341?`:\260]\033\233\346?:\235\314\014\254\327\331?\314\tWA\310Q\327?N\024[\222\007\255\347?Y\202>\\\306\204\347?p\3701\351\t\300\356?8\331\336\216\201\226\303?\272\3013_\2524\351?\234k\320\243:g\341?*\027\342\236\265\257\341?\310\350)@.>\304?h\362\206Tr\377\310?p$\2670\376\324\246?\216E\274:\327p\336?\030\3327\324h\235\323?\203\363n\347\231\332\340?&7r\021\314\357\332?\256\025\314\022\025\207\327?\216\235G\277>\204\343?RC\230}v\321\354?@E`\311?\034\245?9N\352\240\236g\355?\232\035j\302\203*\353?\256\010D\3205\301\322?\364p\256\350\372\273\313?\366/\271VrT\346?{\0060\266m~\347?\322\233\370Xx\377\346?\200\371\007Hy>\332?\030\263[\360\037\004\260?\310\234\216\374\264\252\304?\010\345\235\'\205>\276?\376\346\361\027\271t\320?8\334\230>\212\363\317?\377\236\347?\020\360\245,\343\260\357?R.#e\0142\347?`\363\3024\326 \310?\'U\256\177P]\350?\366o\354\013}\203\341?O\016\377Z;9\356?|\212\241\347\230\323\352?Y\035\221;\374\247\340? 
\302\305\346ol\334?\333\372\033Qn\372\346?\372C!\267\003\t\342?\020YGQ\262\265\272?h\217\266\240\254\273\350?\220\232\231u@+\347?\032\200=_\035L\351?\026\310,&\261\203\351?\250e\215q\312\226\322?]\001&\241\014<\355?r\261sw\367\032\325?\324i\215\265\257\031\332?v\005\007\275$^\333?\371 \r4U\363\353?\024\362X\326\272\022\301?\374v\035\\\345;\357?\265\216C\321n\374\343?\321B\230;\r\305\344?\330\016=d\222\376\261?\310Z<\225_\265\323?6X<\005\225\253\336?Z@\320\240\330\344\353?W\236[{E0\357?\027l\266\347y\305\343?\204\343\365\303\337T\332?\307\030\316[\201\211\356?\300m\321\255F\322\317?\207lV\365\"\345\351?\210/\001h^\353\311?\314gy\341(\255\306?x3\"\254&F\311?\023\037\233\023\204\207\352?\300*\221\\\027\020\304?8\266O\364;d\307?\213\235\362\317\264\346\342?\240\364C\205\025a\327?\322\020\216\203\227\023\341?E\270j\362\331\312\342?$j|\247\227G\353?\210\\\213\341_\351\346?\361\222\324\032\216e\352?\010\'8\371\t\340\336?\004m\037\345d\217\310?\363\306\274\020Z4\344?\354\330MM:\207\343?\355\355\215x\253z\352?\210\032\177\307=\000\274?\"\223\0022s:\337?\357\364\377\331%\234\351?\031e\313P\310T\354?\336\000\315sV\335\326?cZ\'w\233f\344?\370\240\2211\025?\322?\240\223\273\275C\343\332?\365u\311\025\335t\345?\335\014\001L\227\306\344?8\370\222\342~#\326?<\202\0101}\023\357?\364nF\264\230s\315?\'v\277\207S\327\352?@H\307\276\220q\305?\210\001\322u_\024\313? \034;T=\233\345?\314\347\312\320\017\253\307?\300G@\365\241\370\217?f\304/6\017)\337?8-\353\322\240~\345?\350\360\240\353\205\351\341?\360K\366\002;F\255?\214\t\310\200j\372\306?X_)\300Mc\261?y\r\273|\3621\355?\200\ty\276\313\036\254?P\t\273\n\222<\314?\232p\325O\3003\355?\332\276[\037&\235\321?\313e4\255b\353\352?\334\307\326\210\034\204\346? 
\002\'\244\257\222\312?h\365\366\342\274\'\316?\013\223h\236t\206\343?\016FgOG\257\322?\003\037\327\367\255\035\354?\326\2045M\342\246\344?\333\315~T\220\276\352?\204W7\351\371\r\345?\360P\023RC\366\326?`$\237m}6\257?\000\237\220\307\312\304c?_\256\241\344CY\357?]\367\2241\305t\341?O\365\323\311\276\246\342?\360oHX\334\356\334?\204\226%\341.\036\317?\037\213\271\203\331P\353?\304\260\205rG\227\306?\247\315\020\337P2\357?@\301\245\345l\000\301?\310\221.\372\355T\352?GZ\360]\343\354\357?\267K\374\377\274\203\353?\234H\r\323[\t\342? \312M\262\256\034\231?\306\0244lLN\351?\250\242|\322\320>\300?\326\240\323\360mj\330?\\\202*\203*_\315?\311\001\277\0210\212\341?\347&~\3360C\342?\204og\351\017\025\320?\322\311\356\246\270(\337?\234-\316>\t\261\343?\236\"3\202\353\351\343?(G\010x\323\323\336?*\303)j%`\334?!!\220\254\233F\351?\300\333\333r\226\226\335?\031\322\362\362\263S\352?I\227M\251v\021\343?:\210\255\204\376y\341?\242\243[\214\374<\352?\323\272/\200pR\342?\350\223b\032\027\036\331?8ND\377\005\354\320?`n\375\337\341\363\346?\000\010\374?\260c\223?PDl\223\211\336\301?~g9\025wJ\341?\210Y\326\325\273\221\315?\017\271\301L\242H\355?$\247\375~\261U\304?\037\273\307v\177p\341?t\030I\006\235\236\347?\030\234\265\345\0255\301?\254\n\241\246\275\372\324?h&>\211\307\302\352?\204!wJ&\025\336?\310O\312J9\352?\217\306;\333\355\233\344?,\211\223T\242\233\353?\252\355\312\016_\010\323?w6\220\t\027\263\343?y\250\360\245\205\246\344?2\002\342\307\212\037\336?\310\223e\026\344\374\334?\034\351\364\'\305l\311?p\0228\226bj\313?\3364`\221\030\224\353?\201)\244\237\034p\346?P\021h|\272\035\332?\000\362\267p\217[\\?w\262>\255\231\231\357?\316\245>@\026t\324?\342\273\337\037r\273\342?\240~>\\\306s\237?h\370\223\322\261\014\326?\304\351\306\337D(\320?`\325\000\271!\251\251?`\np\353\354\211\316?\034\200\221\037\310b\306?\021E+\204\227u\353?\354\203\177\323>\023\345?z\247\304PQ\020\346?F\210c(\267o\333?\276aM\0166\347\355?dt\031\343\367\331\350?\274\375\331\367\363\213\310?R\362\243Wx\301\357?\3
42\320\275\341\037Y\341?\034\306\250\300\317b\322?\200E\364\252\233W\343?\232\313\2117\276\313\356?\354Ys\025\261\211\310?\244R\261]\'\256\357? .\317\271\231%\221?9\375\235V\241\203\354?B \016\201\360\275\347?@\217>\r\246\240\346?\002\223O\251\316G\353?\240\223\326\326\374\376\251?\014\240\240!\3334\302?\032\207E=\\\374\351?\353\\q\341-\221\356?r\231:\242}\376\357?\002\341\337^O\316\337?\200\364\036\335\257\376\263?.R\371z\036Z\321?j\347\341\223\357\356\356?t\227\365!\334L\302?\321\2436\343\307v\346?r \354`\223\201\352?\216\305\221\354\220\225\345?\000a\342Lk\202m?\272\267\"\217$,\347?\016\230\322\362\334\352\340?xiN\316\033;\275?\272\207\260\223;\265\350?\207\236Z|\301\027\350?2\204;\257\356H\356?\274`\351`s\340\324?(D\246\301\324\217\261?\240\206Xg\374\017\272?\200\320k\250\316V\337?\312\264\023\251%\206\325?\010\353\204\177G&\327?\270!\226,??\342?\326\2304\321\317\333\342?\232\234\375\317\307\354\322?\325\327\253M\341u\344?\000\302\375\235\213\351\324?pP\310\301\216y\256?\253\017\240\312\305\251\347?:U\273\366\005\310\343?\310\277\234eI\214\315?\001\035\276\257T\324\356?0%\244\031\305\203\316?\352\347Qu$\274\347?\210\354\311nwP\274?B\t\2114\007l\357?,m\000\231\271\202\307?\260)d\236E\305\356?S\375\0063\346M\352?a\212\343G\300\"\344?@\324h\247\354V\344?;z\026i\325\223\342?\300\252z3\267*\322?V3\226\270\324\004\332?\030\005\203I\336\370\320?\270v~{\234z\261?\340JmV\036\305\300?4d\225\361\301R\331?_\306\220\213:\372\347?\260\267\363\331\314\241\302?\020\373LE\322H\317?R\3753XE\255\333?N\353s\235\233\006\332?\377\337ZLm\350\351?\356\275_\215s\352\327?\257\2713\262\003\272\356?;\"\026L\237(\343?\310\036\366\323YK\357?\225\331\246\364\257\341\340?\354\262\201r\303\311\326?0SA\302\302\331\357?_\204\005l\263\010\357?\254\352\362\306\374R\305?\265\000nT\254\215\340?X\311H\004\212\264\355?I\375\257\021\202\373\342?l\"!\354V\036\327?\000\342\220K\017(\235?w\020N\221\253\005\344?\000\301\001\025\206@\334? 
\203\334<\241\337\332?@\334\255\363\312\030\232?\000@\243S\370E\267?\363/:?-\030\354?\227\370\323\246\254V\347?\202p\320R\336}\320?kn\304Oh\257\346?l\026\331\253\336\323\331?\200\250Z\037\230\010\275?2u\313\346\3251\352?\236\265\007\252G\237\333?\026\026\356\275\223\330\345?t\241\242\237\025\275\343?\002\327\330VV~\336?Wef8\374\003\357?\240\270Pu\241\320\353?\354\302\026!\035\261\302?\360\203\337\014\000\222\255?V\327\205\372\001>\343?\020\256\255\020\254\241\313?\262\215o\214\245\223\346?\034\\\326\324,\213\326?\370uo\347\001g\266?\370\2473\247\374\363\302?\360\330\212\0143\247\315?\\\244\274`\273\305\356?\352\362\236\331 O\333?\356E\270\366\363X\330?F\320\236o\237\212\357?\026\007\323\030\240\272\335?\350(\032\277\037>\264?\022\346\313\211O\360\333?w\340\027\371\220W\347?8s\254\276s\206\305?\270\014\317\231\346\232\312?\263\014\332\307sS\350?\375\317\364\274}\204\353?\266\343E:\341\324\357?\261\326\243\346\214/\342?\226\300\217\310\000\235\331?\210+\217&k\225\265?\"d7R\215\377\356?H0\326\206\315\235\332?\020\343\264\335d\337\243?\324\000\033\202#Z\334?\230\345y\374RE\312?\246\261\363\366\265\271\333?\3166\021\024\211\236\321?\010z\221\017\026\222\277?~\265\312\317\347\276\342?[n\\M\212\024\346?\222\237\344lV\021\323?\353ys\252\220u\344?\264`\310\213[\344\347?\242\227\\l\321S\344?\020&\371\310a\357\250?\324\343_;\245a\331?\n\006\370>H\247\353?P2=W\310\033\304?#8\335$\351C\356?I\205\250\0366\266\353?xY\352\260\352n\335?\324+t%d\370\316?\370\272\244\262\321\232\303?\\\022\331\302\302\035\356?\206\213|\311\210\251\334?A\272E\203|+\352?h/\250\224\207V\353?\321ex\207\236\002\356?@\214\214\351\325\241\252?\264p\331~N\344\331?\374\037\225\213\005<\336?\224\037v\270\303\177\302?xk4|c\273\314?h\003\026\250W\\\313?\230\\\022\037\371\003\310?\230\347\274@n8\355?\330\3063T\376\016\272?\010\371u\3616\022\347?\010RW\347\252\240\346?\035\213t\361(!\341?2\307\222\024\354K\341?\016\243\206i\025\245\346?\247_\002^=a\355?\374\033\336!\362\332\315?\306\201\n\334\000\334\343?\363\260\2
117Hq\351?\243`\342\377\013\316\343?\256\352IW\365\t\322?\253\006\234\327\353\315\342?\334\310P4L\330\302?\206C\351\220\363?\341?\300:Sm9A\214?\026qXE\030\037\323?E\016\243D\216\226\351?Z&\253\266\225\037\344?\325\311\034\317z\316\350?C\372\024\014(\360\343?\324\274C\204\265\330\353?\270\213\244\236\277I\317?\315A\332XNI\352?\340\365j\333\311\251\232?U\024\333\031\246:\356?\346s\367\021!\303\346?\200b\022`\256\270\332?\3760n\351\356\204\353?:\340C\276\324\312\327?`$T\255\177\020\232?B\000J\234\256X\322?\212\362\317\362~\310\325?$7)\221\221X\317?\227\277\333\251\310Z\355?\274X\3768\347>\313?\362:\037\340\262\306\340?\324\010&\032\231~\334?\010\230X{\020&\315?\274I\007\017\320\002\352?\201\230r-K\357\340?\nL\210d\3373\331?\354\n\311\032\361t\322?,\211b\232\225\366\357?@}\3443\214\016\354?\350\270\377a\377\030\333?\270g\345\004\014\300\316?\364\307\242{\377p\330?\250\226\320f\302\313\333?\274\344>\"&\373\351?\254K\323\213\364\212\323?\000\322\307\330\342\222\344?\350 Js\016W\353?dx\260;\036\010\303?\334\260\242\\\'l\301?.\\i\226v\004\321?\270\'>\324\335\346\340?\003\031\034\2629\302\354?\372\353\2607W\307\355?\320\020\223!4\274\314?8Ot!\031\"\273?\330a\314\026:\215\351?\002_\321\'\203A\342?p\376\260\0003\331\252?\017\003\247CO\264\354?\232\266\177A\304\177\350?\200\315\214\020e\351r?\317jJrO\354\352?\256\351.\341M\252\356?\ro\025\2510\030\343?\206\177\210\022.J\347?hqQ\'\275\035\342?\310\206nV\362\334\262?8\2072_2g\315?\246\004\350%n\346\351?\350\257\222\244\023m\263?`\022I \377\002\341?pk\262\t%_\342?\344\332\216\257$\315\311?@\204\350r\334\225\231?\317\376\3654\374Q\352??\354\346\216H\340\353?\220\353G\216\217\326\340?\346-\014\360\345\370\345?\\.\206\212\342\201\316?\330\206[x7}\357?\220\322\255\320\305\032\331?\324\303YA\265\t\322?P\351r\337\300\220\343?\334~6V\301\200\300?\234w\272px\016\344?@d\362S2%\234?\014\274O\343m\357\316?\316-\03234o\347?\346\255\320#\312H\340?\323\260\000*_\007\347?L\231\252\263\255\275\337?\021\034v\230\247\t\340?B\254M\177\034\347\336? 
\223\310\341lm\320?J\365\351\023e\022\334?\014{\'\216\033:\341?\336v\233\363\003X\325?\020\234\262\346\210o\301?\260\005\204\"(\324\267?W\301E\017\006?\354?|_1\207\300\276\302?\276\327\002\027Pk\335?\315G\344\246<\210\342?F \266=\016<\332?,\274cL\220\323\340?\027\266\242\323\213\022\346?\334\275\024G\255\377\345?\200\240N\307)d\226?\302a\302\307-7\353?\204,\326l\2271\350?\302R\021\312\316k\345?\266n\255\363\267c\347?\307-\253qGX\350?\340\260<\300\311\260\313?\330\254\356\312\302{\347?\276\353\032Q\023\352\341?\310\327c\372\024Z\302?`&KO\243M\252?\351\022>\\\321\321\352?\334\205PI:\261\323?\016%\317\262*k\355?Z7\024\346\216\263\325?\340>\370g;I\321?[X\003\022\306^\350?\032--x\205\300\341?Ql\266B\230\333\340?\206\245\351\243\244\026\334?\264\360\263\225\223\351\303?&\3671\316\365,\327?\344P\245_\352\275\315?o\271\n\206\n2\354?\327\3722\361L9\340?\216F#\333\337\360\322?y8F:ud\351?`\030\305\216\335\007\274?T\335\026U\350\274\315?\307\266 K\262\271\340?\252j\002b\325u\337?\370\312B\215-\363\311?\230\275W\336X\343\354?\000:l\212\341\370a?\225_w\014{\032\357?\363Y\326\346zo\356?\340\031\261\002\031\246\250?\250\262\327\317A5\347?\\\374\375\266C\317\314?\014\264\340f\300\304\305?\010]\363(\236\014\311?T*\354AV\010\342?`a\277\"i\r\316?\364\332\266+#\347\327?\262\234\303\014L\217\341?\226\354_~I[\352?\342\022\343\017\021\334\323?\376P\013:\013\242\332?\005\315\303b\335\205\352?n\032@\236\007\353\354?<\230\337em\200\314?z\233\nO\355m\351?\200\260l\230\021u\312?<|\3546\225\'\315?d\t9\326\311,\302?,\230AM\305\004\356?$\247e\037\265\255\304?\376#\317\005\246\240\330?\013/!G\356\225\352?\340\020\365\"m\333\315?d:sxP\017\352?\260f\t0\0052\317?\230mHC\231\004\265?S8Uf_\366\356?9\314f\227\236\037\345?\200-)\313\252\n\231?\250\224g\277v_\261?\230b\350\037\232q\263?\200s\223x\"\230r?\200\\eV\202\211\236?>Z\324\310\246\024\335?\3160\n\355\251L\327?\024\246x#n~\345?\212G\270\2463_\336?ZP\006\215\263M\332?\365\020\316~\342f\341? 
]\365K\3467\320?\267t\237\375\243\344\342?(P\001\213?\365\307?f\\\244\250\337\035\352?\006\020\213\r\211\361\321?@\346s8\024\213\210?\335\327o*=x\340?\366\202w\316g\272\341?p~E\257\236`\341?\274Sh\350\032@\337?h\313\204\335W{\301?P=o=\325\321\347?\"r\316\024\230,\343? $\212\273\3039\313?C\2643\232.(\351?\263\335%\t\350[\357?\325\216\350eK\362\342?\210\336}\222\030(\272?\2200+\256 \223\250?#\344\261C5K\350?\356\214d\362g\347\354?`\253\347\307tJ\355?\261`\324?\201\037\350?\2043n\030\300\003\341?H\301\250\006\312\222\344?\317\242\000\324#)\357?8e\312\331\276\362\321?\257(\000\205\220\312\355?@\363\342\300s\251\247?M\265j\373\032\232\351?d\032\021+J\356\322?p\326\027\240\315\227\324?\200%\016\025>\362\324?\037\331\'\370V\035\342?\244E;\212\250r\353?\200\331\233kEW\266?\222?/{6\277\333?\n\234m\207Q\212\327?*\232\2539\027\345\355?\224L\346!`\345\350?\307\233\261\027\273u\340?\254`!\234\034*\302?\272\370_X\212}\343?\021\003SU}\251\340?\240\216\245\201\311<\256?\373\010[/\374\345\342?\373\3559\230\201\371\357?\243F\030\203\306\275\342?\256\022\334\346\317x\332?\343\3531\327\343$\352?T\315w\177\261y\347?\220y\363\"\233I\320?\202\014X)d\231\344?\240\374En\210\264\237?\343\313\316\347v\270\354?!\354\003\030at\344?d3\214\030\016\274\305?\n\217\327\315\333\010\352?\374\240`\300JX\304?\323\342U\324\215\274\342?w\010\364\307\340\244\345?\307\274\277\310\272\275\347?\252N\256\312.\035\356?\026\230\253\366\356\323\322?]\034\351\205w\214\346?\236\307\326!\037\360\351?\272\325\376\220\234C\326?X\346a\227\n}\337?\236\372\211\374\204\342\326?\300?\260>\271,\353?\314\301x\313\202\004\300?\034\351\223\224\323\274\307?\3362;r\260z\346?\221j\002| \204\353?\000\200`\317T@\337?\340P\037\021\204\314\230?\032A\365z\001}\356?\355\3045n?I\351?0\t\341\032\000!\267?\304\337\035\022M5\347?\274\210\230\005\262\352\316?d\367\026\005O\013\303?%\014R\220\337P\346?Pn\260\036\n\224\331?\360\367:\177 
[\335?\\Yf\037\354\262\312?\244-{\242\025\202\336?\274\2536&\346\002\323?\222\242M\300\302\226\344?\264\262\247\230\260\033\345?n\212\236\301K\235\337?$\236\335\001\022 \301?`\330\272\035\022\372\247?J\2065\001|F\327?\000\245\00330|\335?\253\311r&-\005\350?\302\246\300\323\225\264\347?Z\376\343%0\265\353?S\217\302\231\007\263\342?\25154\000C\327\351?<\353\220`\376?\331?,\335\252\nQ\261\316? \362\304\004\314\207\263?\365U\264\325C4\343?h\224\211\177\250\274\273?\320|\023\316_\335\305?\232\317\007\245\342\207\345?\0241\314\027Y\205\341?4\204a\327\000\272\336?Tw\240\314\271\252\311?\000J\255\363\217\347\311?\336\271y\276\202\263\337?\212\367o\250\2579\342?+\317\262\304f\232\353?\243l\246\3762\343\340?\273\240\242/\347\016\344?0\\\331s\357\304\302?\023{0\226q\376\345?\220!\256\0017\002\274?uY\305\276@t\343?1\3378\024\306\276\345?\020\333\263)\211I\320?\344\257\360\034\025B\302?\244\256Fw\325\370\312?\214fF\334\016>\330?x\251\316\355\2406\337??P\316\234eF\356?.\301\275$\264\251\322?\030\241\226\276<\276\355?\212\n\021@\3327\354?\000\215\307\0338\334\212?Kx\2025/E\342?\314)\327H\222\004\307?A]\001\255\337R\346?\271\315\'\274\201+\357?\340z\370\211\236\214\267?%\240\333\261w\341\357?\220\363\242\206\241\243\342?q\374]\302\310\264\351?\021\035\275\207\311W\345?\004\325\237\362\277\350\355?7\213\203\224f9\346?\346\260\245\212\205\304\353?\3200\363x\235|\262?V}=fn\343\324?\334\341\316\024\216\264\316?\306iT\016\340\273\336?\360ES\263\220\374\273?Bqv\377C\"\350? 
\302\276G\316~\243?\006(\233:\202\232\322?\326\265\320\211\003\225\353?\010\307\333\253\032\330\347?\354\270\353+\354\243\310?8o>\245\351B\312?\323\033c\017\366\374\356?\330\254\302\037\207\006\271?\240\032\255\025\300\217\265?\310A\247`\211\340\301?\361\210/\377\033\201\343?0\217\263&n\353\252?\330\001\024\005@\020\355?\t\213\303\006%\027\352?\252,E\0316\034\350?`O\223\321\005T\233?\224\250\351\233x6\326?\372\261\006\363\250\363\344?\352\3310\376\020a\351?D\014gI\030\177\331?`\214\022\211\212\303\332?\342\252b\225\207\373\322?&\237\356\255\'\037\341?NPw\223){\326?\330T\331\010N*\350?\020\255q\2264\212\340?f\020G}\333\213\355?\200\350\366\331\273e\304?\002^\226\336\217\034\333?\300\216\267\231\262\226\320?n\343o\340f,\333?\35754\360B\000\343?S\3741\360\342\316\350?\0264@Q\231\234\322?\256\322\354?J\323\345?\314\034\223\211`\314\344?\230\234\016\n\236 \356?\354\"\233j4&\324?\014\002\243\200k\264\356?{%\004\317\365[\356?\010q\224\234\327L\321?@pJ\330\307\231\303?5\335^\265\222\375\354?\264c\205\251\332\200\310?\244gN\212M:\354?T\354\006\227\233\037\353?\230\311\307\364 \335\335?\254\212\343x\322\315\306?Vz\004\234\262a\337?\236T\352\367\243 
\327?\314`\023\2259i\347?\360\014ob\006\251\253?\"s\001\266\2534\331?\364\274\315/,>\340?\027\016\037\267\325\242\340?)\332\302H\311\255\345?\276\021\351\246\317h\354?\354\335_v#7\314?\274\375/\326E\265\334?\\n\335\312\373\353\317?(\363\201\035\307x\261?\214\000\374:o\370\334?2?\342\030F\377\324?\324w\331\245\254\374\305?\250\240\331T\324\241\332?\236f\252\257\237\340\330?\345~+\341\220\260\356?\010)m\3240\276\267?\330\314\322\371\211\314\333?\326j\347R\030<\327?w\332\214o\0234\354?\005\225\310\374\334\016\342?\005\345B>\r\306\356?\366\335\240\372T/\327?h\206\341\223\213\236\315?\217[\273D\254j\354?\237\262\344\226\370|\355?~Y\320\3155\265\347?\034\005\266u\375\354\317?\250a\271\220,\"\321?\004\275i\275C\202\304?P\']\307\003@\241?\224#GLw\'\343?\242\220\252\034\311a\344?\305\004\352\351\254e\346?\2403,\352\304d\256?\266P\376)\342\177\356?\300\235\227\000\365\321\235?\360NB\270\354\023\242?\036b\n\010\220t\324?4\245\205Kn4\300?!\026tD\263q\347?\360pS\303\003\260\337?q\3408}\330\\\355?\314\332Zt\377(\336?\022\354|]r\315\330?\2169\230#:z\350?\004\276\226\307\014\023\313?x\"#h&\222\320?\316\2633p\230t\334?\244J\233g\362\355\322?\324b\206\267FY\337?\244*\023\260\267L\341?\006\246m\326\2064\342?K\260\257[&T\343?\300\306\366<\267\224\300?\210\"\215gD)\326?\263~P\016\031`\356?\3202\250\277\213>\271?J\';\251\366\315\321?*_7,\324\037\337?\370H\266W\245\257\315?\260i\037\240\034\357\246?j\235\033\210\355\221\330?tG\377\300\352Z\327?s\361g\027\210\257\344?|\335\032\273\373\317\333?\300S\254Ibg\235?\262W\357\017\266\276\321?\266\200\030V\265\306\322?G\002m\366\316\350\355?P*\375|\030\367\321?\264/\020\347\327*\342?\246\261\264\365*\265\327?PbS\034t@\252?\310\215\367\263\377M\304?<\313\235\331\247\006\312?\362\307h\235\363P\347?\210\334\230\013\373:\345?\206\270\233EQ\216\347?@\220\206\254f\310\303?\324\2675TG\373\333?\205\240\344\230\246\322\344?\3001\350P\310\244\322?z\223\240\235\264\301\331?$:\333\000U\277\352?\214\331\305\243\275\014\322?\365K\305]\0058\347?\345\245\354\271V
\204\346?q:=Z\220\375\356?8\032\352\367\371*\305?\305\204c\270A\247\350?\263\342\267\374\017_\356?\212\261\255\351\356\010\334?l\214|\244\023\027\302?`\365\027C\326\310\320?JR\371+\330\274\346?\246o\211\241\002\351\341?j\263l}w\354\321?\025\r\347\0304o\345?\002\263I\237\031n\330?\324\272\336w$T\341?\240\224\314\366\tQ\236?.9\341jP\200\356?@\265\350b\334\354\267?A\263Z\305\002\353\344?8\\\235.\302L\342?\316\010k\"&)\342?6\367!.,\240\342?\356\010R\215G\331\322?N\257?\256\033\232\334?8[\241\303\033\221\355?\002n\224\255\361\371\324?`*\364\343\003\343\337?y-\304V\367\254\345?\200\351\326iZ\031\257?\202f\3619\332\263\325?d\021\276P\375,\327?\006l\220\317-\250\353?\305\231\273\214\204!\341?\310\303\335:~^\321?\277\2008\310\332\357\344?L\240\331.\253\002\316?Rj\036 O\245\347?\204\327H\253\364\"\330?\\?T\330R\324\312?\235\347y\303\346K\354?4\341\334\347\332V\320?\232C}\213\232\177\327?~>\372(nP\357?\0344\177\371>\371\324?OS,6\000\"\345?\200\017:\006\277\216\305?\376\014N\271\027A\325?\251\252P\236\373\005\353?\270\317\233\264\360k\265?,\250\201\203*\370\321?\336z\311\203\224:\334?\340q\372Y\305\230\244?\002/\336j\'[\347?\312\215\352\225\204\342\352?\354\305}\3600\226\326?\306o\354\350\020 
\332?\004\335\003\366MT\310?\\m\375\236\352\326\347?\260\211cH\312\264\261?Q\303\027xP\373\344?\022\260}f\nG\340?\347\244\301\360\204\352\346?\301T\034\014\214v\356?\000\320\024\302TYf?\314\361\371\264\362\304\353?\233\265\367\234\371\312\347?\025PR\325}D\342?_\341\007@\354\031\353?\320y\017\254e\274\303?\007\311\250kf\005\345?\357\2779\370\212}\344?x\256\201\363)\271\270?\330\3410\265n\216\351?\t\253\231_\345\226\352?o\204\360\257w\347\355?\n\030do\2330\327?\326_\\U\006\360\325?\256\0059|ut\356?b\033G#k\002\334?\n\001!:\346\252\323?\315\355\341\034>\t\356?\356\261\376\362\016R\351?\016\341\320\002\273\323\352?\\+L8\240\210\341?\204\222Gi!\247\301?\336\327;F6\234\343?\n\364\330dgX\357?\202\270\245\263\320\267\322?\344\334H\006\215h\344?fqD\034[_\320?\270\037\275\016t\372\264?\340ol\022\322\370\272?@\315~U\0001\326?!v!\305@w\356?\374\367\021\361>;\310?$w\257\376\324\201\327?\341\362\352\177\351r\356?mXG\025R\303\341?d!\306By\030\311?`\017\r\310\264\022\314?\245vB\024\327\000\342?\356\365Y\007(\370\321?\352\344C\363\362\364\342?\312\3714\271\233u\356?q\320\0356Yw\344?\266\031\033\336C\331\340?o{\025\215\207\017\347?\340\351C\264R\324\267?\000\006\"\214\037\030X?\210uv[_\262\315?\367QC\036#$\352?r\315\000\033w\210\353?K\241j\221~y\343?\000\365\261\327\253\231\276?PK\340$}U\265?d\267\374:0\204\354?%D#17\255\351?\274|\332\332i\365\300?V\030^$\0130\342?F-\t\312\237\276\346?\036\356\211\r!\232\330?\346\352%\341[\374\326?\256\270\352\326\354\017\334?\374t;\027\3117\317?ra\236\351^\001\345?\2222\004\340\342E\324?\324[WP)\366\322?\364NM\340\034`\325?\230g\337\0252\021\313?\377\024}\244\321\311\340?\264\246\024\3214\362\330?F\2638\2429\371\333?s\337\r\\\005\272\344?\300\314\356\335`\\\320?\306\302\273}\242\321\352?\035pf\3019\211\347?\300%\237\264\210\024\217?l\313\342~\325\207\343?\256C\333\006_!\353?\000\243\376\220R\305\313?\246\003>rq\350\336?\370\321\365\027\314Z\301?\367\024$\305\020F\343?\342\237\341\350\370~\327?\217P\3756\030\252\357?`UAsM]\305?\272^\250\007\277\037\3
20?u\302}\307\364o\357?\034\230\311\211c,\351?\230\016$\2325\263\323?\034\202p@\267\226\357?\320\322s\304q7\313?\314\334s\261`\340\337?f\362\217\344A,\345?\006F*\200/\"\331?8\251A\r\023\327\320?x\343\273fz\262\302? \013\306\364\305Z\224?\372R\364\352\276\230\330?Gs\361\2516\270\356?\223[\357Obt\354?\352x\000Go1\324?0\tN\215f\327\266?V\200\213s/\250\346?\240\263M\035\226:\274?N{\030\225\\\346\352?\236\213\3269\326\001\335?\304\331\322fj#\342?0\355^\314p`\257?4\021\361\253\306\321\303?\230wk\033\227\226\270?80\250\214\226\301\345?b\256\222\003\3030\325?@7\217~\320\205\355?_`\3424\357z\352?\004e\226\323F\224\334?\200!s\221s\226w?\363\\\266\234\021\236\340?\273{v\363!\323\350?\310X\212\000>\004\321?\334P\267;s\200\300?IK\034\2509\323\351?\326v|\365\225\256\347?\270\036\000\305\000\375\337?\006\345I\347E\365\330?t\344\375L\221\325\336?t\244\240S\013\332\343?$\360\r\372\251N\341?\340(:\303\243\363\302?@a\024\246\203\r\206?\"$\021u\232^\322?\214\024\317\301\353%\356?p\305\334\337IL\317?\000\'=\020\031t\331?\223\302L\375]j\351?`\271\221\326O\033\263?|\304&i-\223\357?\306/\245+\253\305\352?U\r\021\343\001\222\357?\314\321\225\366\204\326\347?\272\241y\365q\263\324?H\265L\352\220\314\335?X\202<\013\200\001\336?(\004\241\030]\255\346?\032@\375\273\240\024\330?\274jj5\3120\310?\326\214h\232\003\337\330?\240\207a~\264\253\353?.\021k\010\307\231\353?\206%(\020\3106\336?\032\300\323\200C\325\325?\220g\301\275\"\352\323?\340\255*$\225h\352?\314\243=SMT\326?\204\345a\3176\276\320?\375i\"a\303\313\345?\224\347\237\312\3533\314?0)\201\312\023\215\273?@\254\333\362\252Z\356?\306u5\315\237\t\344?g?\013o\233x\347?F\236{\353\344L\326?`\302n\204\226\371\326?h\326t?\371p\356?\2307a\307\365|\274?LF\020\337o\210\347?\300mS\036;$\324?\324\266\323\330\315\231\347?!\307\365X\020 \346?7d/\352z6\347?\254\206\252\236\261\234\354?\336}F1l\240\324?p\322+\n\260Y\304? 
\316\003(\224\204\345?\245\232\006o\311\371\351?\024\013\363\326\263\350\305?\022\323\251B`\322\357?@C\022\341\320q\272?\371\362\314\300\340\013\345?\034\2464\321\212\272\317?\262\327\277H\204\026\333?\221\317\267&p\270\346?\327\022\341:\204\274\345?\325\314\301\350\226N\344?\010\335u\377\222\314\311?\336q\201l\036\010\327?\030\315z\215\032\331\341?f\256\337P\321k\347?\013\224z1:u\341?\000\307_\035\222\312\340?\304%\203\336\262\207\301?2\372%\241N\375\332?\013\034\024\357\204\275\356?0\027\342\3214\222\347?(n\330\375ns\273?!\003\234\323\021x\346?uO\244\207U\302\355?@\302\330\024\377[\312?x\021x\225\342\304\274?8~\240\271\217\034\350?[\226S&\211P\342?^GI\361\257m\330?;\360Q\013$\031\347?\316\354\366 \372*\337?<\203~\336\267:\332?H\333\354\264,C\316?\205\354]zCi\377\340?x\010x\272m\005\263?e\337Fq{(\355?.\346:\341I@\323?\010\277\235\0365\264\262?sjO\210y\331\346?W\337\311\216\013\375\356?8\366\013D8\351\262?\024ds\2179\035\343?\314\237\322;|\320\307?\374HleJ\375\327?\263&\340\032f8\355?\360\022\221\302\017\233\340?\355\311\263\277\262?\354?\364\347\351\320+%\322?\005l8?k\344\346?\037=\2207\367\026\346?E\327\225\343\326\205\346?\nY\004$\034\345\357?\020\r\272\3621\304\334?i\213\001o\311(\342?\004\356\250\221\235t\305?T\373\331\273&\341\332?`p\324\205\256\256\254?\256\257\rY\006+\350?\222\224\005\213G\334?@\226Ou\242\377\337?\024\034rX/\037\301?\250\006\010\363$\210\350?H\343m~\200\320\334?\300\221+\367\021\277\340?\023\020\304\r\251;\357?X\320\304|d\354\274?\370\030y(\321\253\325?\376F\360\205\307\337\353?x\252\312\307\275\257\353?\201\r6&;\361\341?\250\312\026\021\345\363\333?\227U\310\313\245\327\354?\256\361\246\337 
K\347?T&\360\221\364\270\337?\300\203\326\016{L\207?\276\327\346\377\010V\355?\306\nd\006I\026\323?\322\345\264\255\340\250\330?\312P\226}\020\236\336?Q\356\n\220\357\366\340?U\3118H\016u\350?\212\3160z}\251\351?&R3\212\026\336\325?\344\300\250\250\273\301\316?\332\334y\306\270\341\320?\240\213d;\317\014\332?|\223YK|\005\321?\237\277(YB\374\357?\242\250\243T\020\273\320?\3443\323\241\026\300\326?5ue\352\366\376\340?\020m~\263)\215\307?@e=~\225(\242?5\226\307\365\203B\345?\216*\367\004\177\354\352?\236\332G+\266\322\355?(0\315\202`p\325?\362\311\007\243\021_\337?F\212\367\201\207\357\352?\360~\315d\266\014\314?t\031W9k\025\325?M\366\3775\347\263\340?X\356D\222\324\034\351?R\033\237\025\242\353\341?(\323\022/\364I\300?U\3670\353\270\362\346?\224\223{\3032\347\346?F\325\222=K\345\321?\222\204\315\345?(\353\217\363\353$\306?\rG\300\240A[\343?\271\371y\032\276\217\342?\324\372\376\236\355G\346?\2448K\022_\367\322?\024\031\212X\245\354\323?X\300\211\313\'\205\310?h\020;[\334=\272?\210\364`X\241\014\263? q\221>I%\323?\370t\035\324\370\205\340?e\254\364\013H,\351?\373\020L!C\031\342?Nm\304\202\270\250\326?\224=^=r\235\304?\236\327k \255.\337?!\325\321\316\0064\345?h#\321\362:}\321?:\001\220\315\262/\345?s\035\201\353\006I\355?\330\003\233^\032l\321?\203\304\2276\000\327\351? 
.v\t\270\020\331?\000:\202\225*a\311?\006\232\245t\017I\325?\310\275\370*qc\300?\266\"\212\r\035\024\341?`W[\366D\265\273?\220n\364\243\363\314\304?j\360NY\311^\350?\263|\361\203y\263\345?\004\267\241\272gd\353?\010\341\301c\234p\332?\0200\322}\006\334\336?\250y\227\005\372(\350?\332iC\335=\333\347?VS\"\340\357\004\330?\336\321\255\264b`\351?\262n\334L\227B\354?\360\321\272\020\032\324\334?\264.\000\362\277W\343?\372\274m\346HM\323?:-6\341\016\335\325?*\265\022\340y\273\351?\354\2673\322AQ\311?P\256\213!\024\376\315?JS\212\024b\270\356?Y<\222D\263\034\353?\322(\344?Y\245\337?L!\243\232\321U\312?v9(\223\366\204\345?b\013D\311\237\372\354?\316\363\233\206\244W\340?H\364\025>\267\241\324?\\\216V\335\244,\322?\343.KE\033\373\351?\240\215_XWc\347?R\320T\207\320l\334?@F9\022\306\351\252?,\224%\035\026\311\307?8F!\330\347F\277?6X\372\353\237\013\331?w,35\314v\346?\031n\260\3606<\343?\334\360\033\001\035%\320?\037#\200\237\206\373\352?\366\272\241\247\334l\327?:B\245 \314*\320?d\327\007\352\303\371\317?{l\257:5\325\341?\0166s3\307\325\325?W\262\022%\010,\357?\220\354\357\375>>\333?t\346g\346\223\302\346?;\n\221\202Y<\342?T\363\211\264\216x\325?\013h\302\220TE\347?X\262|\2124\355\340?\330\345\324\026\257\277\260?\010W\031\344_t\275?{\034(\034\030(\355?x\303^\353L\370\347?\030ViK\333F\327?\340\263\211\311\346\247\241?7\004\314\013\"y\344?\210\201\321U\245/\347?\327\376`\315\225\202\342?\360\314\224\274\357\021\272?\365\351\213\035\213\233\354?\'\365\2757\003\365\347?\320\376\353\244\036\267\326?jR\274\375\"\251\321?o\210-\313\345\034\350?\034\202\267\266\204\371\326?\010\267t\2053\200\265?`\320\r\341\005\t\263?C\003#\017\333\271\346?F\006\252\034\257Q\332?\253%\263\'!;\346?G\277R;L\221\356?\370YZ\354\177\355\270?\352\332\271\020M\247\321?A?\252\253}\252\350?\274D 
{\347W\342?X|\231\241\345\326\333?\247\310!\331\365Z\341?\200\252NH6\357\217?H3\367\245\004\330\266?[\\[\307\355\'\352?(\353\360\203\316=\275?d\376\206\013\236\331\332?n\216\020\330u\250\342?Z\246uS\244\027\340?\355\221{$|\223\353?k?IBu\205\350?\352\367]\232\355\333\327?\016|\260\251\350\336\347?X\013%\274A*\264?\224H??\314\260\332?\274o\211\301\313\354\320?\232L\2374\035\341\333?\240Q\302\210!2\235?\372\302\265\304s.\335?\2600\270\240T\225\305?\320:\335\365\372\016\263?d\244N\253\210\207\310?P\306\207$\244\312\337?\270e\250}\226\210\354? c\307\n,:\301?\212\364\314~>\266\344?\327\253\035\227)\322\353?!L)\016\177\307\350?\356F!h\207b\325?\002\360g%\332\365\353?\240\254\224t\324\230\266?\000\\\3068\352\347K?\253\234R\016q\307\343?t\310\244\233\247\307\334?XS\276\254\320O\302?\200\300\353K\2569\353?\020\021)\206i\206\253?\2507\\wd\303\320?6\241\030o\364\236\325?\034/&gZ\354\336?;kW\216\322N\353?\224C\265;\312L\323?\n\243\333\2722\\\353?\200\274]\200\255\347\202?\344\277\025F\020b\352?\"\334\024DY\267\346?\204\354fya/\303?\360\245\242\3640\225\324?\245&\2155\'O\347?Z\341\r\0348\330\336?^GT\003\033\274\337?v\3763\301Ql\334?t\316\231h7\234\302?\300gJ\210\376p\336?\200:M\332\026\020\227?\302\242\027\214\246\010\323?\2604\325\244L\371\346?L\003\260\327(\302\314?\232\311\357\013gR\320?\342\035\337\316V\305\331?\230|\233\326\0027\350?I=\026\2324\010\352?H\360\210;\247\222\274?\200\213\245\211\262\375\233?\212\035w\207HA\335?\302u\301!W\257\325?~\310Q\002`=\321?\224r??\024/\330?\231\375\305\212h\321\350?\230]E\0327\023\355?PX\261Keb\240?f\006r\361\353\254\335?\200\265\222<\n\t\355?\244\374\301\010\306\010\343?\316\365r\274(\016\322?\0206\025\340\323\013\275?\376\037\327e\312\357\322?Z\302S\376A{\332?\030\025\275\270p\342\317?\202A|c\0173\357?\232\247\333\350^\247\326?&\275\225\343\300\003\347?\301\246\260D\365\342\341?&\202BB\261\373\323?|RX\203bD\316?Q\363\340\302s\233\357?@\006\021\235|R\272?\264\325+\315\204\336\316?0t\024\202\2042\252?M\252\307%`s\350?\214\335q}\2605\321?\
367\377\231{.\251\353?\310i\232\"\377\217\341?\365e\224\214Z\254\351?\344\315\264HP\312\356?\000.{Gm\014s?\024%;\345\327z\316?\024\331\026/67\354?\355\206\245i\r`\343?\306\233\350\016\tK\337?\330\023O\362\371\030\304?@.\335\333\353\002\347?\362\264,=\312?\335?\326\276\221\0015$\343?\370\363\333\347`D\321?2\000XkP\300\331?\002Ty\305\021\314\345?\n]7f\334X\341?^\r3\267\332\262\346?(,\335\274\344\227\344?\336\010\263\223\265\203\337?2\312YR\244\016\324?mY\253@\264d\356?D\253\356K\343\345\344?\223\'/2;\326\345?\254\000\214U?\273\302?\004\333\215\006\364|\346?\036\337\326\367X\336\335?\014\244\221\2612\221\342?\000C\204\373\002~\221?\002\375\337\006\374\236\336?\036\342\272\330E>\356?D\204\251q\004\217\306?/\342\025\263\254\022\351?\304h\373\365X\221\352?\342\t\365\315\334\017\347?\274DQc\026\223\352?\313A\337\264\337P\356?\356\353;\202\177\220\353?[\232\262\315{\272\354?\274B}\355\026\224\334?\302C\374\003>\234\342?\321\203]\230F\315\345?\251\212\367\255\221\250\340?MoxoR0\346?\337\354\247\220\320\201\341?\352\027\226\000\216D\355?\320\375Y\306\271i\244?B%\362\272-\251\343?x\367Zj\030\235\353?\2263\352h\331\342\323?\360\025Z\2757P\240?\314kK\304\245\342\351?\'\264H\277 \234\340?Y\330VD\031i\350?\177\364\327\365\361\034\342?\246\006\004\232\215\273\345??cd\377WY\341?T%,\022Y5\337?!\032\265\000\257n\356?0\336\326\370a\020\243?pH?\227dh\323?\270\331\320u-\323\315?\222\316K\222W\\\357?O\023\336Z\\\315\355?\257\315\221]\246\224\353?\363\324X6\207\232\353?\327,\177 
\351\036\340?\200\021\210\363V*v?\274\233\356\r\310\316\343?\245jT\035Y\333\354?\032I\\w\254\230\342?-j\311?x\275\345?\3477qu<\221\356?\354_\3171+\231\306?\016-\371\305\230\355\335?7.J#\330\344\351?w\224q\023\357\035\351?jwc\255\350\217\345?x\035\221\212|\363\332?\353\362\350\213\257j\344?B\013\322\357\025.\350?(R\377)\277?\301?$\257\347\350\000U\345?8q\331\006zs\354?\214B\005BS-\343?O\235\276\302c*\354?\250\005\234\270\023\240\345?P\340N\013\377\210\314?V\223[\364\005$\320?r\377\031)Al\330?\374w\230\027\352\244\336?\306.\231\325\267o\354?`~\213~\301{\232?\316\225\000$\206\353\334?h~G;r\277\303?\254\357K\2148\366\344?\034+\357\213T\300\305?@\007!\3730\264\313?\031\333\314f\365\210\354?\364\272\237\2444\276\320?{\305\304eG\332\353?R#\336?\230\277\357?\010x]\332\350\031\277?d\231`Pa\371\357?\266l\265\325uO\342?\350\2503\356iw\303?\270\244Y\n\004\227\317?\324D\364:\226\255\322?kp\327]h\341\357?\322\274\252X\007#\324?\231+\330_z{\353?\\H\004\3159A\313?\304\301\371Xls\357?\207\244\257}\230\305\354?\312l\314\311\352\244\341?\375\242\260\r\327\257\341?\312\245\3453a\330\354?\230\226\247q\025g\315?\342(;\214C\263\350?\365\314\235t\036\236\353?\226Xx*gw\350?\372\3414tf%\340?\323\r\223\324\005x\355?0:\204\003\337\243\317?P\200\'\260hU\274?\234\t46\210\233\352?\226\261tk\221\206\331?\316\341!O\211\017\354?P\201\010\304\211U\273?\014\005C\\\377\000\340?`\267F\350\236\200\307?\266\337\337\206\336\203\325?\214\014\370J\256\033\305?\024@\\D\375\002\341?\226\343\367g\272q\335?8\225\371\372\037\204\350?o\217Y\343\253\014\353?\320\245\362l\"\306\301?x\030\267\376i\t\307?\014\362\376\375\332@\315?\270z\357\314x{\330?\242?[l:9\341?l\332@\326\361\377\307?\210\375J\312\252#\347?\n\311\3646nq\345?\364\345\251\220f\226\352?\270\312\377Fw3\276?\344\354\233\324n\206\306?PQ\200B\026\342\315?\010\251\266o\315k\316?\204A\31076\212\301?NYV\342\226\273\343?X9r/\227f\272?\003\237\265\246\030\277\344?\236+\023\016\321\314\334?\360d\034%\345\263\325?\247\370*\271\233u\342?\374o\023H\244T\317?\263a_\
342\375\273\340?N\210\255\222\000\364\356?\252Z%wu$\321?\342\2538\327\340\200\351?(\t\244l=b\344?\244\247\371\267\310\230\327?~\271\376y\236\215\335?\000O\242Zo\242\251?\313*me\346\024\340? (\302\205\207|\240?k\257\357\365\261/\345?\212\237\361BF\370\345?\260k\213\373\211\004\337?\314\022d\246\200\030\347?&\205[_\363\250\345?\265\354\243\307o\232\344?\340\007\030?\3344\225?t0\230\032R^\346?\010\202\241\220\345\262\347?\"\'\177\344\020\035\354?\334\251 \334X\214\306?\334\310\336\206Z]\321?S\366\345\340k~\343?\340\002X\010o\257\263?p\216\271\014y\270\333?)\354WJ\257\277\341?\362\211/\303\326U\320? ]v\000\005\237\304?>Q2\006\016\017\342?\200\377{\nw\362\277?\230\251\341\312\227z\317?\366\2403\302\237z\350?{\300\201)/N\343?>\202\202\220\201\252\325?\356\334\376T.\373\345?|\327\330\024\251\342\336?\212\340\310\315=\227\322?\250\001\213\327\362\212\332?\002\215\337\326\216\327\340?\270\371\277p\234\270\357?\246\345\350\n\2441\322?\334;\305/\241\023\340?\036bd\361\030\245\353?\360nH\200z\315\344?\270\244p@0P\277?H\333\331\234\212\274\357?}\353\214\355!\346\357?\314>\330\324\3776\330?\305G\250_/\335\340?n\353\226a\211W\333?\210*\213\036\205\370\343?p\212B}\355\350\346?\2055\277\374\3269\353?\224$\260\007\030\000\321?\257\3240\314\235\336\340?\000{\364\\\031K\301?\310;\246\231hq\350?\314\373\256\374\2039\347?\361/\023\366\332\374\350?\014W\021\214_\220\314?\370]\0067@\352\272?\260\264\210\356\353v\342?\304\037+\210Bd\303?\350\n\223]9k\340?\324\210$O*\241\322?\000\255\246\243\323\311\211?\360\327\036\372\223\267\324?\240\266@W\254t\251?<\2271\242\204\366\322?20\334\020E!\336?\2070|?\301c\353?\345\353m\006\'\321\354?8O\275\340\221\346\272?\000\342\007]\313m\345?`C\373\265\200h\227? 
\231P \342q\321?\022\031l\222\037\307\353?\214\177\332\343\240 \311?\266\037\216\233\370\304\326?\022\324\031B\303z\327?\354\302\343R\357?\243D\335\354\303\245\355?&%\234\232\204\276\341?0\035\r\242m\356\274?\004\256\253\025aQ\352?(\307\021\205q+\316?>\251\024>\250q\350?\340\023\010o\261>\233?\332e\351Q-\001\346?\000\t\235\014\310\241\215?\2369\231\307\311\337\334?\027a\0277\236\235\346?\3256\345\260$\037\356?D\360T$\007\233\311?\330\t\004\006\031\301\261?\022\377\2162\360x\352?\320:\355\017\361\323\311?\324\r\266\210\r\222\344?\242GQ%\244\035\336?\225\237\370\256\025?\343?4\370\017\205\324\265\303?\273\276\037E/\001\355?~v/\033\202@\320?\310=#\363\274\321\260?P\204\214\301cZ\323?\255s\201;C\r\340?\206!*\226U\372\343?Dq\035\221\361\214\337?h\325\331]\240i\355?\366\215\367Qpu\350?\314KF\340\272=\321?\'\300\343m17\342?\000rg\034\313;\235?\\\036\316\n\337\201\345?\340\\\203(\030\230\277?`\"3H\372\260\270?\312\211\363|\312\t\336?\213\344aZJH\357?\204c\351\362@\021\342?j\211~\266\000\002\322?x\323\026\326\253\336\322?\3419\342{W\277\341?A>\337\263\365\250\353?\t\255\261D\335\307\347?\255\337S\272TN\353?`\346\010A\236,\332?nV\'I0\016\353?\377S\322\300\034\322\346?\264\304\371\304\344\316\353?M=\330@\t\"\355?\326p\254\215\312t\350?\203d\346\230:\330\352?\202\212\254*\252\222\353?\265\227\373\374\017\260\350?&cX\014;3\334?\300\356\371\010\"V\330?\210\211%\275\357p\330?\303\351\311\227!\344\351?\354\224\300^\333\221\300? 
\311pE%}\310?Tm\326\262\013\340\301?\254\274G\026C@\326?<\nK\302x\211\331?0W\363\256c\032\254?\204\020\000\013\350\342\336?\365\005o\316\207\247\343?\0203h)\002\265\304?\310\335\032\344\224\311\341?\241\275>\345\233A\355?$\327\215\254\030\314\356?@\021Z* \307\264?x\201\372\342W\347\340?\350X\212k\3324\321?\024MA4:c\300?\264NwQ8\025\316?\333\034\024\337\203\330\352?\276\252\316\360\177e\333?\256:\206\036\310\321\353?\256n\277\203\254\223\340?\220\315a\342\301\254\332?b\001\351\212\226\241\324?\340l\342\034\307\334\331?htf\300.\026\263?\334&\247\021X\315\350?\274)jf\314\344\306?u\031\035\250\262n\342?\225\355\275\007\245\363\341?\245\363\230\tG\271\355?\360\265\324\344u\326\355?\254:u\"\377\201\344?%\026\326\232\024v\355?\224\251X7?C\341?(7\362\214\004\376\273?dG\317\320\315\256\303?\215\311>\013\222\014\356?\n\333\312\352#\255\354?\337O\2728\353@\354?<\030T\373vT\336?\346\307\001\273\022\022\336?\210\224\'\256WA\304?s\271W\300\254\257\347?\350\376\2265\2251\310?\220\304\026\2102\304\254?$\231\366+\304r\312?\265z\230*\373)\353?V\221O\014\263\266\325?\200ew\373\340\360\245?\315\220A9\202\307\353?\240\367\2425\252~\245?\320\204XL\007\336\345?\316Y\r=\266\254\347?\271\006\375L\177\201\347?}5\334\301i\244\347?\3373LI\010[\350?3\242,\330o\326\347?\236\021\306\271\001\005\335?\242k9\351\231U\335?\260\335\223\245\232\235\304?f\"\345\020HM\326?\234\303\035\206}\003\310?\360J\324\364\236\004\247?\222\360\376\001\230 
\321?\324B\t\010b\312\341?\270wf\017\311\004\333?%\2521\364\270`\347?\204$P\374\267/\311?\370t1Z\363\023\263?\002\006\"\006\316\351\350?^2\2042\272\177\322?R\306\220n9\237\330?\245\004\366\234p\301\355?PU\341\355\323\213\312?\004h\322p\020\323\326?\000\014k\0213\310\324?:^\360#\235\013\330?u?\234\210m\005\353?\307\266\361\036\364\001\344?(\233rj!f\316?\252\033\247\262\325\202\345?\344J\222x\250\254\346?\345\235\344\001\352\343\356?\250\024\376\261Z-\340?\036\200r\273c\210\342?\270\'3\231\365x\273?\000\306E1\r\247\352?\300\304\032\265\305\203\337?\001\377L\2551e\357?\330d\231\037+3\306?\\\370\013\371\321\025\313?PRxez\013\260?\351\356{\363\3050\343?b2T\007\217\307\356?Dc\355I\260r\310?\372\026\2013x4\331?\300\300ED\247\356\253?\\\007\364J\022\n\311?u\255n\266\2310\352?\216\017>\366g]\333?\350\034\317\321\321\026\301?\030\016\254&\200t\342?\334\315E5\335\271\334?f\363@FM_\323?\010\353\302O+\261\275?\255\215Z|)\306\346?<\211lrE\206\336?\3676I3\033@\343?\252u\233N\325\214\333?\256V\317\307\016\341\352?\017y\340F\332\024\347?\355\235?\311\326\\\353?P>\213^\217\353\314?0\340\353T\273\225\353?\244\312\212I\300\\\343?\377\272\366b\375-\352?7\241YC\037q\344?\360\014\226\353%\t\251?\226D\274\036x\246\341?5\024{\237@\202\346?P-\341\005\262\327\345?\264\360/EG\033\310?\220\352!\3410\270\274?,\222\214\004\275\353\331?t`\241l\271\252\335?8 
\2726\260l\334?\330\300\271\322,\354\344?Xf\351h\326*\336?\304\022f\253\342\306\350?\020\215\227\035\235\321\336?\351\331\006\235\023(\354?+\237\221\320M\000\343?\250\300\266J\356\254\337?\250\007\235\177\331(\272?\264\016\211\307\027I\312?r\243\233@\255/\321?K\007\320\315J\322\347?\237\235[!\022B\350?\362\343*(\240\376\354?\014Udz\321?\315?*\316\316\213\215\207\340?P\016\003\346tU\333?y!\246\256g\214\347?r\371\224\305!T\341?De\230\207\'e\301?\230a\245\no\377\357?_}.\236WJ\344?f\366h\200\2376\337?\217\337%\013\261f\356?\014\360\212\274&\222\353?\326\017\n\245%y\332?$\346\034\022\376Y\302?\300T\336\363\351C\247?a\313\035\036\025#\355?\211\014\371:\344j\347?\210\257\2254\0042\324?\372\217\234\177x/\346?\334#\261\271-l\330?\200\014#\267k\256\332?8\270\'\234\371\025\320?N\331q\252Mi\351?T\342&Yk\n\306?Q\027\245\n\233\264\353?\254\\\200\333\342<\300?\231\014\300\274\355+\350?\200\303\355U6&v?\222\227K\311\014\314\341?\262\260\246\016Sg\345?\030A\206\320\205Q\303?\340c\312\375\341^\300?\200\217\350\231\"l\346?\032\364\300Zm\362\321?\000\271U\300\340\007s?\204\231\203\277Va\313?\304G\2324\307\362\357?\004d\276\212\256\306\315?\350d:\021\033e\357?N\277\255\2261C\322?T\221Y\360\257\025\314?\372q\311\352\252\336\345?j\"O{9\230\343?\240\225\035a\341\253\253?\035\247\336\232(\364\340?\300$\277|?%\256?o\0332\234\006U\350?\204=\246L\250\332\304?\245\2609\347\342\304\343?\264,_y\016~\320?\224\n\241[\001\227\306?\310\331\246*\2340\273?\230\031\037]\037\314\356?d\314h\253\335\265\326?J\206,\212\216\377\326?X\025\347\272F\020\274?\010\201\313W\007e\356?\345\201\236\226\266U\350?\210\206\224\230\201\273\317?\201\237\302\207\363J\342?%\037\262\013{\362\355?\225\206\323AD\367\347?\014\370\331@\347\220\355?\007\375z8\260\'\345?\2101\214L5\"\325?\034_.\355\343\307\354? 
d\234\275\2547\307?\305\322\312-\016`\345?\3015\314\203\241\016\342?7\350Ke\237\263\346?\340\317\372U{\316\342?[\360\253T\214\006\347?\362b\363\350\215\324\352?\260\331?\t\323\345\342?\000\026v\345\\\263^?W\372\372\236\375\250\352?F}\0272\306I\335?T\335\244\021]\347\317?\240D\227\312v\266\226?\322\343\311\324\331\024\326?w,\261]\227\016\345?\000MP\177\220\r\357?\000\2272\343\020_\356?6\306\233\023\231\230\346?\201\257Q\271r\231\342?\225*y.\203\227\353?~\263X\037Dn\325?\223-\213\354\240>\347?%,uj\373\321\357?\310\271\003\224\223\005\324?O\335\261\003\330\351\352?Pe\263\330\335\320\353?#{\276a\365\236\343?\000\205\217i{jx?\020\310\361J[E\247?\02513\262\'h\355?\016\357\n\245v\234\336?\302\264\2660\302\315\344?\224\344\010\340\310R\340?\230\021\264\232\217\007\354?@T\220\265\311O\337?r>C\243ln\327?z]\007\322;\266\342?y\245?\343\2610\354?\247S\033\344\364=\340?\272]\3374\031\357\330?s\300?/\271\350\3719w\356?\203\200\372\354j.\353?;%\316\236\346W\357?\256$\016\022\374\356\350?T>\331\276\325\026\346?\n\007\367P\370\354\336?\020\354\265B\375\005\322?\303\371\202\231\361\366\356?/\241%\227\333\325\345?0C\236\244\204$\274?\236\246`PV\373\327?C\3605\351\214`\346?\304\007\314\354\310V\342?\356^\265\036\301|\355?(\245\017\025\324\323\262?\272\221\335z\002\333\347?\274-iBq\014\353?\270b\336\235\252\370\344?H\001\243*\320\004\323?Z\3511I\315C\322?4W;\347\336T\337?\222\303\005\007\313 \347?|\234i\240Y\267\356?\\\271\034\331\375\262\342?\264\000\237\362\263\253\347?\264\200h\331>t\333?;\266\213\244\274\315\352?\022\244\001\346\356|\326?n\215\000\260\356!\343?\376\307C\241o\254\326?\006i\336\225w\017\320?\364\226)}\217\300\302?r\210t\271?\023\320?(\270\331\260\231&\340?\330\205\204\345\3217\300?4\207\365w\026\237\343?\340\277\2322:W\325?>\304!e\264\224\322?\224\223\367\267\026\316\327?\004|\244\303\251\030\316?*N|+i\371\335?P\217\r\2312&\262?\034\2639\324\030\243\347? 
\314\251&\033g\310?\004~\221\222\327@\350?\310C\235\266\365\376\266?*t3\000\307f\336?\215\357\027aD\333\353?\216E\007{\235\230\340?8\333\234\264\316C\267?6\355\007\236\203\030\322? ?\370Q\340\233\324?:m\005E\326\014\334?\375\266D-\247:\357?\030\365Y\006\010p\300?\0364\206\330)\025\356?R^J\025\"\237\352?\266!\251\020\"\274\324?\230s\314l\272\300\331?\024\225\312\2668J\333?\223\330\0043\021\242\351?s\263\247\021\351\020\350?>\320\244\377\344\376\322?3\377\232\314h\016\357?\321C\251\241x\212\350?P}\202P\203:\315?F\026{\230\337\345\353?>[\213\013\'8\357?\316\226P\267<\007\352?8F\020\327\213{\317?\250i0\277\224\333\264?\336\261\270\350\375\345\321?X\363\207\320\014\013\350?\253MKW|\310\340?`SQ\276Y\316\305?\343\237^E9\243\340?\024\322\237\035%a\335?0\002\217\276\367\354\254?\270\2749\317\373\236\265?\223P`\314\234\320\353?,u\006\004\026F\327?\350&:\313\237n\305?\211q\360)\236\333\344?\240#\022\303fp\311?w\312b\222\2572\347?\370\276\276\272\t\021\302?\324\276\270E\030\376\343?\362\005\354r=I\350?\222\2741\222\315\253\351?\340\364\014s\274\250\265?\340\362\354k\033\221\302?(:\201\272gl\300?\361\262\335)\324\354\355?\030\364\314\204\336\325\323?)\030\266\207d\212\354?Y*\316\373\375\253\357?\244\232\351\231\327E\307?~\323\373\177S\202\321?\300\226K,\374\334\357?S\246\226\"\342P\341?\376\212\334x\324\207\356?i2\337\322;\334\350?\234lluD\356\325?\200\n\342C\365w\246?\003\251\303_+%\350?\210\370\203\323\252k\330?`e\247,ik\257?\265\342\347\261O_\344?H\007\222\007\263\316\323?\335\257\035LQQ\347?\004z\'8\360\033\324?\020[\307Wl3\317?\364\301b\352bt\315?\314\336;\030hZ\355?\220\234\032\376;\002\327?\3447\301\220\002\244\350?V\316\324\327pE\347?^4\371\300\001~\335?\3109\016\235S\274\331?\260\016\024r\213\264\306?4\210\301\366;h\340?lh\220\2259\273\310?\\\022\345\314\230\373\345?\330\004\234\357\236l\275?n\336\327\231X\267\336?\005{\3113*@\356?\324\007vql\352\304?\252b\237\034\301A\334?#:\227\266\":\345?\256\312$r\251\364\345?\260\306\324&I\235\312?\304\315\014\333K\265\352?\260\240
\331\335\005N\254?\270I-\362X&\331?\275\257\023\303^\036\353?\035a\r\365S\364\353?\250\202\272\314\207c\335?D\376n4+6\324?\350G\0217z\375\264?F(\344^a\277\327?q0\255+z\205\351?\314\371\357\262\037\364\335?\200\007\341\222\227\327\222?\376Z\233l\020\273\352?\324\006\245\\\240\005\317?\367u\014\030_l\342?\257\324\314\341\377\312\341?\r\207c2\305\326\357?\203\203\024\030\271\372\355?Hyk5\333\017\267?erf\221fv\354?_30P\306\225\347?\360\231o\235\t\337\332?\234\337\354\255V\006\347?H\354k\315\375\227\265?\034x\226r\005\346\337?r\356\252\240\263\313\354?\374\321\014\230-\275\327?pp\260\365\024\370\341?\000\'\377\230\232/n?\034R\207\024LD\333?\356c\227\360e\022\357?G\361]\201*e\354?\000\'\031\220\347\373\272?XI\323X\004\001\306?\331\261X\032 \362\355?\007\320\365\3052Q\340?nB\303\256jz\344?\246\034)\225k\242\346?\030\340u\306\037\000\322?0\003\326\316\005\260\315?\375gP\303k\243\351?V\375\267\254\256\307\352?\000u\345K\255\355\270?4\202\216\307\365\036\306?\220\356\321\312\3768\301?\201\310~Y\257V\355?\340}`\275\206\177\247?\334\225\316<_\234\346?Hk\270H\224\030\311?\014BY\302I\302\315?\022\274\314\366Y\250\334?\233=\005V\232i\356?\257\307\027w\261\\\356?\030\032^\\\261<\277?D,\321I\333\305\303?\216\016\235\354C\306\331?\256lfz%\205\331?a\315\230\274/\033\346?[4\222\231Oy\343?H8\324\017\254\330\311?\260\257\371L\325\265\313?^U\325!\356e\354?\362NK0\035\001\346?\250\377>Xz\200\300?\000\023goE\264\260?\026\357h\265\013q\353?X<\306\376L\370\306?i\331eI)\036\342?\307\262C\304\006_\341?\200n\260\024\263\233\344?\030~\200\021\212\303\326?\002HX\362k\365\350?\322@\334\004/\366\345?U/\314\t\205\344\357?0@<|\207\243\344?P\265\016\\^\324\322?\340l\366\351\323a\226?\316\372SP\272\000\343?N\257\373\304\255\316\325?\352\251-\006Q\200\351?_\235\362\306!\317\340?\331\374\275\035/\210\356?<\202=\034\007p\303?,\036\027\354}\275\322?@3\241\014\221\334\275?\343\240{F\355\256\350?\260\231(W\031\320\321?Wo\'l\353\021\357?\224P\256\267t\324\301?\202\207\222%\036\001\336?\337\371\224\202\356\206\
344?\242\324~M\314\355\325?\264u@\260\326\253\332?P\264\337\305\242\325\242?ZFN\377\205\013\356?\030\221X,\341\005\264?\305\350\360D\312F\342?\177\342P\262\213n\345?\364\002\031\033\200\242\352?\220:\271 g\202\333?#x\027\232\367\242\340?\276\233\352\037\343\302\356? KV\305>w\336?\250C\000\301do\271?\255\202\306[\302\261\340?%\374[dc*\356?\324\327\312\'\037\301\355?p\231\367\324A\314\322?\316\216\271%I\027\351?V$\276\374pW\337?(\275\342\364\230\365\316?3N\335L\016\347\352?z\342\365\363\353\323\336?D\352\032`\016\000\314?\350F\331\276\266|\306?\361\205Y\217\333\331\345?\200ypH*\344\253?\334\031\276/\243\304\312?\223\277\355\212\002\006\350?\000\250\263\337\246A\335?R\n~\316m#\322?PJ\250\227\0007\324?\030\006@\347\246h\320?\022z7\206\314\343\337?\r\374M\327,\224\355?e*\236\341\311\217\354?\270W\366\363\'J\271?&\340\2523\206\245\354?\344~\211\313\014x\306?}{h\313\030\271\350?&\363\344\226\3604\350?\232L7\377G\353\356??\324S\304\035\205\345?! z\333\202\345\347?Dx\332\016W\321\325?\234\264V\004\216N\304?X^l\253^\207\277?p\234\311\305.w\241?\202\312qw\334\237\345?\244\234 \337\n\257\336?\376\311\222\356!d\337?\234Q\036E\221\244\306?,j\373\203\247\261\312?\300it\2006\333\256?0\251\'W\313\300\302?\200\027\025\033\200\370\214?<]{|]\267\325?\371z\227\236\246\017\356?\360e\310\351\to\272?=\325c\347\261\360\340?\214\3258\244\337Q\324?\0334\244/\010\203\354?\301\211W\320\275\010\353?\240\301\010\035Q^\233?\320g\265\360G\022\320?\3140C|\200n\326?\274147\037L\311?\323%\227\"8\240\354?0\267\276\365\375\324\354?\205\367\242m\001\320\350?\212\346W}\000+\324?>\373\021\240\316\231\352?c1\276dQU\351?\300\213k\240B=\337?x\212\222)\234\314\357?\307\202bA\344\203\346?\252d\222 
\303\243\343?@b\241\010\326\033\316?\303=#\233W\350\344?#\222\314\275\037\t\352?\374\211F\311&x\314?\006\363\205w\262\216\327?\001^\247\232=\016\354?\315\365OB_(\347?\311\177)\202\377\366\346?.\236\311D\010*\353?\304\026\223\304\007\032\326?\210\nI\361w\236\273?\330\307\264\347\330\217\344?R5\360?\032\203\331?r\307\202^\347\026\333?\274#\'\222\243\036\343?\\\266lG\262\271\333?\261Y\273\017\346q\345?\340\264\315J\td\326?\357\257\357\313\246\246\357?\340\010\003\013\212\246\262?\302\261;xR{\342?\260\311\213\323x\214\350?lM\002\\{\007\326?d\034\360\354\312\353\336?#{\267&\274]\343?\213\247\207/r\026\353?\330W^\311\2365\346?R\271Y\016\267\272\354?\224\023H-\234\013\350?P\305[\357kY\255?\255\216j\177\007g\357?\366\025\006\350\230\300\324?\206\021\373Vf2\355?\2670\254\203H\264\353?\024\351\201\202\301\026\333?\200#a\224D\302\200?\200\243\267\276\033\204\271?\370\234\300P\312k\313?\375\207\276M\212\231\341?;T\300)@\206\353?>]\303\247\260o\331?$\361\263W\001\013\323?\"]7\013l\246\350?\350\367\354Z\340@\302?\017\367\306\007]\352\344?\334\316\343\376.:\336?\260\034>iYa\320?U\236\n\017\215\212\347?X0w\246K{\272?8I:\235k\323\345?\245\007\373\226\205w\341?\\iT\361|U\313?.\366&\243\013\353\351?D\r7\236\nQ\334?\313\205,m\342\236\342?iP\361\253\212=\344?^\030\311\022\330\376\340?\030^X\362\244\233\337?\036\256\364en\224\321?\270s\254\t\010\025\324?\005\245\rPbq\356?\347\325)\010\227\260\350?(\235]9\353\344\356?\360o\217QX\361\277?xUz\370Y\303\264?\300\220I\0036:\201?<6ch\277M\311?\'\267\241*\341\325\356?\"1\326\221\201\351\335?\014\277\374\233|\237\316?gH7\251\022@\345?v\007\362\267u\200\353?\236\266\322+\304\225\332?\254\031\267\222w\305\314?\367Y\353<>)\354?\266\002\204hd\013\326?\202;\006t\302D\341?\276o\177\003Ms\352?\020\205%\3668T\305?\270\276\375\351T+\301?\246\343\353\204]\022\350?\n\332\370\200\261\020\337?-\344\263\372\350v\340?\320n\326\352\356p\321?\266\024k\204w\316\352?\337\333\324\013\016\225\344?4\310\342\332OF\321?L\301\035P\\\361\310?0\311P_\360i\256?l\365\205\006
\030\303\304?\370\225LL\213\373\272?`+\027^\004A\240?\330\"\265\025\316\254\344?\240\346q\340kp\345?\021s\314;\260\004\350?\344\343/\026:P\346?\234\204\206\310J,\324?+\2400G+\370\354?\341\013\022\235\346X\342?O\201\036\267\373~\355?ba\377|\010\372\352?\344\235\033y\214\246\342?\372\354Y\037\340\005\342?\272\265\313\261\020q\341?x^\235[\266\023\270?\225\356(f\037V\352?\271\370\3516\374J\347?\343\211\267\234\371\323\350?08\260A5Y\303?@3\217\020\355\250\310?5\235\004\204\202U\340?\000\256+\305\277\256\300?\340\234\266\204S\302\270?\321\340\211\251+S\356?)?W]\355\354\342?\312\212\346;\245\355\320?r>\276\201)\366\356?R.!\242 \002\343?o\033\350\261)\346\353?\251\376\201\230I,\344?\212.\000\343\231\371\344?v2\230\244\275\247\322?\351\205V\307Th\347?\306\004Cg\315G\322?\355\227k1\323S\347?H2\246\366[<\356?`g\023\263\233\355\233?X\375\317\"\343\226\353?\020\256\222(\3356\262?\314\017\333\333\243G\331?\030\263\007\355\023\263\340?\221\210;`%\312\355?\303\301\214\177\376\313\355?>?N\360\020\264\333?\010K\006\340\272\340\325?\n\016\205\t\374\235\330?\340\334\335\242L\304\303?\000\327;\346\234C\231?\230\243\365\030\233\000\327?D\027\214\224}\010\307?8\321\330\315\020\352\317?\013X\313\231\240\027\344?dq[\276\267\017\332?\260c\025dzU\315?\200X\336ml\302\261?\373\014\363i\023E\346?\341\'\276\200\214\022\346?\250\210\360\2130-\312?\300\t\277u\340\340\231?\312\034\256\337p\314\357?v\356\375\002\360.\325?\330Z\231\344^\016\337?\270\260I\274\244\321\301?\3401\025i\177\266\305?\214\324\234\301,\035\315?\233\326\024A\251]\344?@[\212\213\022\255\233?\332O\311\000\374\263\353?\'\204(\037+e\342?\350\312S-\033x\302?P\365|T\240\250\311?pF&/\332}\264?\242\242\232\200\327\254\330?\036\243IL\345\357\320?G.ux\304\277\351?9\0169\270\030\252\343?X\302\035\010\261\033\276?\366\344\241\022\333\346\354?\210\263\226<&\372\272?\317/\035\001\001\215\353?\27692w\205F\343?\262#w\023\262\316\347?YN\035\014\323R\355?\306G\240j\225\005\356?p\351{\034\301\375\242?\2209\267\301s\"\257? 
\213s\301\222n\221?p\274\314\330\275k\353?<\311\231(\276)\342?8\350\375jv\261\300?\265\327c2\221\255\350?\210\374\315\014\226m\301?\334\331s\246\004\364\343?\024\230\300\363\236\236\350?<\025$i\245m\354?\356\335\333S\275\244\346?5]\002j\310\003\356?\364\'\013[\303\306\346?\246\341\021\225\367\212\322?\330\315\265k \262\272?\314\356\276\365\251\275\325?\224kZ{\246b\355?\212/tV\333\n\330?\204\202\212\\4\237\307?\202\301\202u\234\262\332?\020p\341\251!\217\336?\212\010\0368\354\376\336?\365\270\344\331C$\355?\256\250j\263+R\324?bM\030\215\206y\357?`K\"I\314o\352?\342\340\2719\332\025\355?0\310=C\t\210\336?\234a\301\270[\263\303?9\204\230\222\341)\357?\030\224\360\321\375\207\324?(\365]\201p\257\312?pB\332\312\343d\247?\356T\034z\272\345\321?\234\304\217\303gH\332?\360HI-`\362\240?\030S;E\013Q\326?4\355\333&\340s\307?\024\277\247\274\263A\342?@!\205\036%<\357?\336\362\201\235yH\345?\265\252;p3\216\351?\3607(e8\354\256?\000\323T\2770\211e?@\270\222f\027\024\244?\356\034CC\320\316\330?\302\202\223vo\215\355?,mh\326\206J\331?\013|1\300\001W\347?\225!\312\037\365p\346?\333\005\200\024\265q\346?\335T\036;}\204\340?\344\236\307\r\242d\311?\032F\221\366\327\006\337?\016\272\020n\315\354\326?@\333j\013\337\261\275?\212\265\265,\214\n\342?\310\014\001g\324\211\323?\340\321\267N\256\032\312?\004\030\254^d\177\357?\003\333\373_4\326\357?\345I`\r3\351\352?\360\007\024\016~\363\313?\024\331x\312P\262\340?\200\305\034PR\007v?P\340\340\302\320?\335?\370\006\237^\214\256\355?\330\220q\244\304d\326?\220\003\336\022\252G\275?\352d\342\2041B\326?\005_)\217\267\204\353?r\272\270K#\226\324?\275Q\0264\017\022\351?FM\232!\345k\355?\214\202o\205\212f\325?\000mX\211\246\303\261? \361\325\014GD\342?\270\025<\003S\304\346?p\363\236rY\r\300?\"*A\020\t\200\332?6\2254YV\276\327?\340\243\365\257\250\216\331?\252\206\251\236\330\236\341?\310_{+\024\361\344?\363\326\263\014\360\226\343? 
K~\351\374#\231?\322\344\\\3770J\352?\210w\306\243e4\264?\260\236TT.\202\256?\004\200d\270\373H\335?JEN.\025\237\341?i\2243\342\\\200\351?\200\324 \216$|\274?\326n\327a+\372\346?\200\351\262|\023ts?Z\"\030\032\3057\323?`U(* \\\337?\343\373%\215V\336\345?\220,\223\237g\217\263?\n\227j&\261P\356?\330&\314\202Z&\347?N}\3323i\354?\306\2625J\346\311\351?\331H|#p\346\341?\204\377Ld\335\000\352?\027[L~\312\202\354?\010z:\242\237\220\264?\373\371\001!/&\341?0\332y\322W)\342?\030P\344\273p\253\324?0\237_\202]t\323?\364\353(\216\331 \311?/\362\313\343\210\345\355?>\250`\035\306\010\327?\237\007\332\312\365\336\353?\t\3731\240\224\034\351?3\343,m\007*\353?@Lb\314\233b\260?\302F\247\031L\246\330?\\\332\362\217\035\366\340?\222\007&`\377\374\352?p\276\"y\335\307\260?\372nw\001\275\345\343?\023\3145i\223 \343?`\177\r\325m\025\306?\215\265\220\220h \355?\360\023\341k\004\324\257?\234U\002@\366\204\330?\350j\257\033\010\264\347?\254+)1\014}\351?\230m\237\367\225\270\345?X\341\360=4\t\307?\324\374A\237\004\214\334?\020X\354\177\200h\313?\023\202\210\345F\245\347?i\303:?\362\t\352?\373_Q\342<\253\344?\237\371d!\330\007\357?\014\234\257~\177{\302?\226\267f;\373B\321?\342\305\214\256\002\350\344?\200:3N\033\243\341?\002)\255\310\016\214\334?\223\0259\313\325\324\341?H\024\353\373\333\263\351?\340\230\213D\3246\337?\247\036\301\375q\234\354?\254[[>\362\030\312?\216\250\014\275\206\221\351?\340\232K\302\337\330\253?\240\364$\302\222\222\344?\350u\374LO\256\346?\347\032\027\204\275\030\345?\2203\335\272%\016\253?@q\022&\020-\351?\340\231r\226\212\025\235?\"\217(n\327B\355?\312\267\264Z!\241\335?\260\356\361 \371\314\241?\350\243XX\031\230\343?\266\332_i|j\356?\324B*;\353\341\356?\324\375N\224!\202\343?x\346z\315\014\360\344?4v\370\361\3425\316?\016\206\242[!\r\351?9\353It\241\315\347?\034\246\002\224\214\024\322?\350\204g\351\036\353\275?;\271\355\'\325|\357?CY 
\327\031\337\356?\310\235\031\276KM\261?}K\203S\331\252\341?\356\255\216\023s1\356?\216\366\032\r\257&\350?\0323\263\265\353\323\325?\324L\000\177\367>\311?\214]D>sJ\302?\0223\330\265\305\311\344?\266&{\324\252\201\335?%\005\026\263\'\373\343?\312\302\222\235\274S\331?\023KY,\317\261\341?sa+\207)w\353?\254\361\370BL\213\352?E4\346D:_\356?E\027\264\027s\274\345?6\231\254Tf\032\352?8\307\341\014\252\004\336?\203\320\334\334\017\272\345?\220\260\342 \210\371\314?-\331\240\314\r\020\347?\266P\017\364gs\353?\030Zn\260\3363\300?\317r\367\210a\001\340?V3\035?w\\\353?\014O\207\005%\331\342?\300C\341\353\276\304\303?^[\370Z^?\321? *\213|\333\334\261?\300\327\235Y\321E\353?\201!\345\021\036\343\347?\227\020\210\014\n\341\345?\244\220\016\242\375\027\314?\204\312j\215?\217\302?\37018T\033\357\307?\247\0323q@\205\356?\260\022\002;bE\334?\200\276\344\334\246\253\304?\314|N\323\206F\342?\222\237T\3075`\357?dv\307\006\201\255\344?8\305\251\3411\254\327?\303\237\350\277?@\357?\217C\202\2312\256\356?\240?)\336\177\222\232?|)_\373F\315\337?\2175^\303\342r\352?~\246\017a\204\246\334?@\273\313\202@|\355?\020\207\262\3706K\321?\004\2377\211\202\353\346?):\033e4\207\352?\330\230i\030\260\007\344?\302W\242\003(O\323?\030S1\334l\037\335?f\360F\354.p\333?\244\200<\241\350\374\345?\004\316L\273\031\326\320?\360z\354??\224\326?L\207\265\234n\350\312?H\211$\234\341_\314?9\270<\267\374\262\350?p}]AFY\317?n\001U&}\244\327?\276\014\'[U&\353?d\213\201*o\237\312?\000(\321\224\352?\213\2778\314vc\357?\342\027\205;\177/\356?p\217\345bK\343\351?\342\022\225\006\236\037\325?\232p\367\0219D\324?@\3340\007\224\037\336?\360\371\023#A\377\312?\222\227~\363D\262\327?@\255\320\243\314<\230?\244\222\013\032\241\022\346?\330b\361\321\177W\340?\006\305v\206\033\264\355?\301\226&\366\361\020\355?\267\"d\341\303\337\351? 
\312\004\360\rO\264?1]2\n\200)\355?\222:\322\375\020\261\337?H\221\237\261\2737\263?\025)i\324\001!\354?\256\271\253Rk\375\334?\323?\026\260\2261\346?\243\263\226Z\037u\354?0SO\2726\277\252?J\017\214\320\361Q\350?\354K\232\370u\200\332?\230\265\340\264=\330\273?\220\262\273\232\311\374\241?\000\004a J\211\312?\371l\026\2677\273\340?\331\244\n\303\321\000\356?\226\353\257\334\017A\357?Z\367\207&\330\037\320?\350@\t\'h5\336?\2349\377\010O\334\345?\231\000~(\310\214\353?@\211\354\367\346\372\315?R\364P\273\\\014\351?\340\206x\234\321h\352?S\251\355!\307\366\354?\246\217\010\273j\270\342?\364\254\250\271\\X\343?\340t!{0_\340?\214M\366\271\305\301\351?8.\261\r\307\200\321?\226+U\257O>\345?D\202\000W\036d\343?CH\370\010x\271\343?\324V\247\230\225-\310?}^\235\253\024\373\357?\314=`^\355Y\340?\274Za\"\020\001\317?\032p\303\3053\001\330?\"\201*\003E(\330?\"|\326\rhf\322?\242E~o\031O\321?\313\251\311\271\365\030\347?\356\276\307\375\255\275\346?x2\212\271\272\276\314?\034\2672C\307 \321?vT3\372\325{\321?\350m\217S\223b\270?\337\235\036(\2228\355?\300\355\004\327\322\336\333?T\303\030\222\263\"\321?\310\247\241\020^E\276?,6\020\253\247,\301?\331f\0144\363@\341?\222\337\026\0108\225\323?&-\024u:\301\321?$EKdj\273\342?\2604\242\2234\307\304?\200&\030#\216\237\311?p\216g\233/\373\353?\3319\216\014\257\272\347?\211M\276\231\316&\355?\034\230\346}\202\\\341?\001\270\017\241\2441\356?\274/\311g\354t\312?\3344\210\346\315F\336?\226c\331\317\335&\324?\004s\033\232\030\372\346?\274\250\364\005M}\346?\030d\263\246@\236\357?(t\247\372J\326\342?\0345\220\313\212?\343?\236\251\014QFa\346?^\224\272\277\327\375\342?\255\301\345\252Pc\351?\213\365\345\201\237\022\346?$N\232wI\330\350?\240w\323\241\236=\322?LOU\020\337v\343?e\005(\307\301\341\347?\250P\275\037\032&\337?\330\3121\203=\305\343?\211\230\306\353\343\226\342?\300|\314\215,\210\316?\330\027\001\010O\237\313?\340Z\025:\\\262\317?x\2358g\013\232\337?\222\022 
13\355?Va=BH2\327?\262\320\363vN\373\330?*\357z3\206!\337?\210?x\343\376\372\312?\000\242\022\246!\037Y?\206\304\201q#O\334?\321\2041xt\\\354?p\360\0071(\252\323? L\333\222\2363\245?h=\347\274UU\324?\004xz\306\342<\344?\335\037^Z\222\367\353?\272G\260\216n\346\344?\246\202\246a\361y\323?\020;Y\351\212\221\306?(D\227\362\216\335\311?\2403\022\213\223\005\223?6\276\247;v9\352?\314&Y]\357`\305?\360F\245\261\034E\242?x\201\034\226\372\310\260?\252\000\t\2039\314\340?\300\333\225\216\246\225\253?\255\277\343\001\371\371\353?\3605[\211\2542\240?\315t~\017\324\275\351?\177\356\332Gq\304\347?U\001Q\215\226\002\357?zO\201\333\240\376\342?\000\343!&\257\237v?\213HU\221\210\253\342?\232N\'\202\371\334\332?X\306\211=\221\005\343?\220>$\016a\236\252?\273h\217\030{\243\342?L\364\260\024\034\307\323?\364*\200\200\340\213\312?_\261\216E\362\251\355?p\3130\370y \304?S\014\273Xo\346\352?\332\234N\033Ho\352?\327@rV:\211\353?\200Z\027)<\207\241?@\365uY\262\016\222?\000\355+\211c\336\307?\014\361\377-O \305?0\312s(\r\344\331?:M\370V<\006\336?J\272\221\326\242\266\343?\004\200TC^\306\352?\3706j\224\241\010\314?7{\237\244\333\252\347?\232:l\270k}\330?\034\036\031;X\362\303?\374i\334i\317c\300?8C7j\265X\330?\334\260\346\312\327\227\316?\214\032\2228\033\334\346?\320\010O\272\246N\317?`\376\357(\260\227\252?||\333\332\241\006\355?\371&y\257\352b\347?\312o\364a6\243\334?\344\316?\252\224[\335?x\321\216\275\337\213\274?\250\312\251\243\014\345\353?\024\207\235\'a\257\304?\2602\353\0315\202\276?\275\217\324\247\343R\351?\206\231\000xi\271\323?I!\225\327\236\350\341?\000\'A\310`\020\337? 
s\307\377W\322\301?\316\264\016\364+\2276\231\340\330?\001\347?\272\211t\351?\034NT\372\023z\341?2\341\304M\260\016\332?V\033R/D\206\344?\301\233\305\226\347\315\346?\236\303\340\026#\337\356?G\010D\363B\030\347?\\Z\307\"\340\221\322?\024l[\014\212d\305?\330\225`\372u\244\347?\211\010\002\310\\\302\342?\037\272\325j\261\270\352?>n\245\325\221\267\332?\267\224\221)!Y\350?\340\217\007!\016;\305?\016]b3\237\022\331?\264\\\226\"\266\273\301?\242cb\264\2769\333?\350\205\270\327\264N\326?\310\300\033\"GU\307?\270n\235\2737\014\351?\254\017\270.\345B\301?\002\360\036.\320.\327?%\241J\t\023&\353?s]S\324\"3\351?\033\251SS\370\365\343?k\310]\256\245b\355?T\362\211\332X\334\311?\310\314\244\205\350d\275?\356V\276\366\267>\347?s\234\367\354h\261\344?\200\'\3034U\013\227?\225\r\006p\235_\357?\300\340@\312\010U\261?\250\027\264\222\"\350\263?\272\233\nO\246\376\353?\334\325\177\276\231\310\347?Mc\374\023l\001\355?\303\"hdZ\024\346?\244X\355?\037\254\351?\\\254\223\270\374l\307?)\357\357\247\021x\351?w\030\302\022\353\363\355?\211\023\366\273\345\025\346?s\230wM\033\362\346?x\356g\030\"j\343?Z\244\333\370\232\000\320?dO\350\022\037V\304?\264\273\332\"\364?\304?\362\240\321s\364\362\337?f\257\335)3\322\355?\242di\226di\332?4\205%S\236\214\337?C\372j\332 M\344?\010\312\205\364\006\375\337?\353\362\367\206\354x\345?\254X\022~\265-\343?\241\226k\224<\005\346?\031\303)\273\232\005\343?\370y\0047cq\331?\344x\232Y\222\250\350?\201N\321\013\026\223\341?f/wVx\207\355?Xl\265\nI\372\332?\240\006i\037\216J\230?\330\257\235t3:\315?C\021\322\220XD\352?\353F\363\222]\r\356?Px\225\027,e\267?\263\313\270k\326\313\355?\327\310\214\016\240p\347? 
\370c\223\373-\264?k\023\306\253@\033\357?\006v\2277t\347\340?\220\245\002P\010\207\257?0\032\020\325\245/\323?V\274d\006kx\351?_e\274\027\226F\352?g|\362\253\346h\356?s\354\311\224\370\342\351?\024\340v\305\267\232\356?\022D}\204\317\267\335?RT?5\213\335\353?\225\336\326\231X1\343?\314;\242\222\314\236\340?c\036\264\232\212\303\340?L\242\222\356\364\307\347?\325%X\267e9\346?\200\321\035\362\300\253\345?\314\245\021M\334\232\317?L3+qV\364\355?ZA*\327c\247\344?\364?\007\273F\263\346?\"\026\252\317gm\350?\361.\315\236\230\357\347?\260~\302\207G\233\276?k\3264\220\033\356\356?\357\212*\346&(\352?Lo\316\262\322\344\333?\000\'Ol\236\201t?p\236?\227Lt\243? ]\272\331\340\361\305?\242\\\236\001Q\344\334?(6\r\316\372\256\334?\300\334\376\307R\035\236?\266\324\350\004\021`\331?|&5\203\372\365\310?\372J;k\250\257\344?\222.\200\352n\321\330?\214\004>F\334/\323?\362\213\216\223\333>\355?\316\322)\021d\027\351?\262\267k%\340\036\343?Z[\023\246M\003\357?\366\351+\361:\205\343?\024=R@\343\214\302?\314*V\033_\030\335?\307=D\344>\237\343?D\351\275\177\007x\307?=\314S\334\276!\353?\266\265\300>\002a\342?\264.\200\256&\360\355?\336\264\275\350\365\261\337?X\363\256> \016\334?nL\311u\020\003\336?\200E\357\237`s\217?\325\206\256\372,O\347?P\340ri95\264?\230\217\016K\323\374\302?D\215\026|\331\361\356?\022!\306\013\264>\347?\274\354 \246\3129\347?\242f\352\336\274O\346?\277\003\377D\336S\345?0\003\035KZ\353\322?h-L\271H\'\274?\322\005f\224t4\352?\334\266\031\214\366\327\322?\310\"\255\360\260\031\352?\037?\306\371[\223\355?$0\322\370\210\236\302?8@\320\356\237\013\305?\351\234\277\344\034\016\350?\033\3411\022\220x\354?\376q\355>\356a\351?X3\224\345\322\355\351?\200\353 
uhm\324?(\271\354\303\320a\326?t\355\334\227,\301\342?d.\344n\256g\332?\307\017i\3005`\342?d\211\017\243]\016\322?\241S\230\253A)\356?\332l\377\244\n2\324?\202\274\207E\014s\340?\027D\305\024z\021\356?\203`\247\n\322\262\347?\\\313\207\r\363\201\337?\310\224\265\322\002\022\276?0y\306\362\204\356\270?\300H\263\336\314c\341?\360\004\203q\240_\317?\355\207\206`\271\210\344?M\3768I@\003\353?\"w\2160@\241\353?\300v\271\275\364\022\242?(#O\001\031\252\340?\270C\r\325\330\220\331?\357Co\363\207\215\345?\366\266\277\024\210\335\353?.X\033\264i\314\320?T\224\263\177\3634\315?$\272\375\214\235S\340?\024\327\325o\370a\304? C\213m\037\017\230?\034\r\312SA\317\332?\237\030\337\270\342\010\350?\240\362[\274h\021\235?\350=\"\206\003h\276?4\326\300\005\2471\356?\240X\204>p\301\246?\024\326\356\230i3\334?\267\352\245! _\340?*\360?\"\\\n\346?\250\n\356\3266\005\337?\320 \264\324\037\345\334?\277(\205k~\330\352?\235\345\036\234\234\000\353?`\364\322-\250\314\303?\214~\026\314kS\351?\264\215\267\\\013`\350?P\336cU\037@\325?J\177V\254\030\037\352?\200\242K\234\205\236\242?X_\332xg\356\335?\344\337\303\325\313\330\331?\314-88\330\314\312?]0\'W\'1\356?w\266Y\246h\201\341?\370\305\3753\026@\347?\204\302q4\343P\343?\254\303\033\272`\321\324?&f\261c\244\004\321?3\360\037\214\362^\340?g\312\2764\030\350\343?\3518\334\251\363C\341?\376\320\2002\352#\340?\210\211\232\213\331\351\322?\200\241\371}in\220?wN\222\353\234z\355?p\205/^qY\277?\374\245\016\326\357\313\300?\374\332+6gr\322?\334\363/\370\320w\315?\"\272\274\014v\216\334?\010\230\241g\276\313\320?xY\221\326\311J\346? 
\202\017!\227\227\263?\364}Cc\300\276\336?\334l\2327\2617\324?\013>\212\363v]\343?&\r\017j\301\275\344?\354Ev\014\331\337\302?k\225\221\242 \t\347?\230`\371\351\'\316\263?W\346\302\232\325\233\341?\360\235\272x\256\300\260?\224\274\006\303w0\331?4\237\216E\252\003\343?`E\353\230}\177\346?z\034\207\335\270\273\343?yeL\334[{\341?=KwWh_\357?8\007\204#C/\343?d\244\037m`\261\321?p\014\257I\202\005\356?h\341T\\0$\326?\224\316K=\177\303\312?:\014\035\356\000T\357?\264\240\361\323\340\201\324?\0179\312\222O\344\351?\336)\307]\235a\333?\220=\"\350\233\224\253?c\257\236\363C1\350?\022\374;\3201\346\351?\344C\361\263\270s\342?\374\275\333\340\327]\307?\0340\243g1\035\323?\360d\360\267\373\317\327?\321\236\266\374\353\376\343? \372\321\202Hq\267?\275\003(?*\220\356?L\r9\217&\215\330?\313G\220\233\320\316\346?&u_|YK\352?P\004b\331\325\243\342?\242\322\336\r\360\250\332?\010\261\026\222D\031\273?_#\226\205!0\354?\220\211\257\367m\026\312?Ow\225\253w\216\342?\335k\3301XI\354?~\2040O&\227\346?\260M\327\356\027T\251?\370j\nX\310\235\265?\336\317m\370:\217\326?U/\216\007\302\354\344?\334Gq\361\337\270\312?\3142\247\351\314v\354?\025\321\030\270\221\022\353?V\344\3402\321X\347?0\336N4\234z\313?\022ZB\241q \325?\373\035\347\\\244\224\342?\000X5\351\353N\333?<\355\300\205 8\350?\216\014\200\023\230\033\324?dY\307\273\030\010\303?%\231\n\271k\351\346?x\220\327R\313\221\346?\2163f\321\341z\351?\332 \303\306\375\332\333?\374U$\270\357\234\333?\341\234\2669\276i\343?\317\003\243$\323\013\355?\033\217\226\3707\225\341?1;\214\0164\220\355?\370\217\210\352f\376\307?\300\374s\207E.\205?\212\317\0146\334d\344?2\3414\rC-\320?nX\256l\013\310\326?Q7\245\031\256\224\341?\220\0236\300\310Y\343?h\300oD\016\204\267?i\210Ux\224E\352?7\307u\252H~\343?\032\323\033)\210+\333?\230\177jy\305\345\333?\347\255h\206Q\033\344?\342`;\313;\247\321?\022\354:j\243\243\354?\234\027\"\307\0310\325? 
\r\355\034@\332\326?\000#\352y\257\333\264?\000\336\327L\357\253y?\366\030\002\326\253\265\353?\300\030\277\260(\020\243?s\022:\031\025\336\353?\243\336\331(9t\344?\330\014r\216k\270\357?\355G\240\327\371\376\351?\333%\354L\003:\355?\313\357\222\003\265\352\346?\000\353r\203+\301\250?\371\305\253E:\244\353?\260\233\026\306\327h\324?\3504Nm9\310\320?\t*\014\316^\272\357?\232SU\201@\t\352?\356\361\364\273K\343\353?(\213\211Cx\321\266?a@lYh\316\357?\000\010\266\010\273{\335?\372\340&\271\316k\325?\000]\232z\177\353\325?\364\204\033^\246\217\340?\345\211\370\252?\013\346?\350L\360>\270\t\344?\255\355\264\342\327\362\344?e!\003\366\341\222\340?\203\026\307\376\030(\357?v\tD\240\303\327\334?G\026\246\315\035_\342?\351:\363\341H\212\356?\335W\243\033\376\247\357?\357\241\346\250K\347\351?\004\207@~}g\340?8\363W6\352\314\321?,X\2062\335\363\312?\201\262\263.i\251\352?7T\273\241\255\217\357?\034\004\307\254\344\225\301?q\000\3514cP\356?\000\310\215\363\002\n\303?\002\245\326\220\016]\325?\216\374\260\234\225\016\352?\000\024\234&\263\237\342?\337\036\016a?\025\351?\270\036\363\343P\020\327?\351\206\035\322\322O\347?(\007\373\333\3235\350?\365\222C\n\230X\342? 
\245i\310\217q\331?|\371\313\341\335\264\354?\260\322T~\323\217\241?\004\270s\225\310\350\305?\023@\315\244 \237\356?J_TJ\306\221\330?Z+\351\361u\005\322?\270\214\356\340\377\206\305?\324\321\341\321\014\327\325?\220\246\224\313\352\252\240?;^Y\351\356\272\343?\r\317\303\322e\351\354?)^;2\213f\357?\207\230\350;\325\034\355?\206\266\034F[V\353?\372}\264I]\223\325?\261\270\355~\300\247\355?\210\2532\314\302\\\341?\362\320Y3`\256\336?\010\236\220\t\221V\310?\000\351\204?\247\275o?\362Z\250\217\030\364\350?PP\224\216\r\274\252?\275\003\270g\0066\343?\332\315c\317\235\364\342?p\026\322\272@\257\356?\000\006\240\241\321\257Q?\222s#\020\324\373\345?\364a\334\214\253\034\330?v*\231\270|8\343?P\274* \371l\345?2(\025\r\221\245\356?\355\325\343\245]Y\341?0B\037N\345U\350?\246\253\207\274FT\343?\264\033cm\232\215\303?%\245\333\224\300f\354?h;V\013{\222\300?\205h\023\324\004\332\347?\317V\220\3130\226\353?\360\016w\225i\377\310?\000u\371U0\360\201?1\267g\026BQ\352?\215Z\275\231\255]\343?=vAv\223\224\340?\317\361\377\305\250\305\351?\367\035\021\025\264\203\347?\0356\277\3331\232\353?T\327 \034\232\262\312?.h\r\270\034\202\352?\212\'\260/\004\372\321?\352\332\374\324\260\371\355?\251\272\022_\031|\355?\346\366:\222\014\214\334?f\310P\200u+\334? 
\360\013o\252\376\264?\024\212r4\201\261\347?\365\241-X\323(\347?\220N\360n\033V\320?H\244\363{\333\212\260?@\345_6*\270\324?\001r\337!\002\231\340?\320\225E\031\370+\301?\240\231\222J\010O\316?DB\251jJT\306?\336\303\213\314\022\035\350?\246\324\232}b-\341?\300\002F\240<-\334?\270\254\214\227\033\226\311?\323qE}\346\005\353?\034\226,\211k\377\352?\233\261\352\000\004\210\350?2xIR\334\037\352?\376\306\232\237\307j\334?\340[\227\264\273)\240?|~\277\016I\027\303?\240#\t\276\364\020\351?0>\215\225\322k\256?@g\231%\367\336\205?*\322STU\331\336?f%p\362U\246\343?$3}O\223*\312?\344|\271%U\266\350?l!\371\020#`\337?x\315L\242>\036\303?I\216\211\036\363k\340?i10\273|Y\350?\212\"wqC\245\333?p\202\303?\240\337]\345\220\267\223?@b:\267\222k\273?\212@\007\265\216\014\356?\360\277owi\360\347?Y\246\357\006E\220\343?\0032r\203\200\377\356?\240tC\000J4\314?6\323\331\211\022m\346?\234\312\366\317h\312\304?\n2\302\357\236Q\330?\032w\250\365\322Y\352? \003Q,\312\005\307?\020\260\t\002X1\333?\226lz*\242\017\332?`\273\204ni^\254?H\367\"Z\314\231\311?\240\244\344{\003#\226?\2619\251\201c\260\343?\302\255\312g\375\265\336?\364\275\231a\302\346\355?-\362\366C\254\247\355?UV\3620:\345\356?m!\n\177H\223\347?z\363\250\004f\246\332?dPv\344qU\335?\010R4\214\327v\327?HTow|\351\353?\032\357\270YZ\271\347?\317\354G\207C\036\356?(\t\302: 
9\265?\036\025\366\224Y;\335?J\315+\373\341\r\343?\3046\316d\377H\354?\375\252\314\320\237C\356?\321\327\010\014V\213\340?\250z\377\307\024\021\341?\216\255E\227X\362\325?`z\016\224\374\217\324?\331s\260\230\235\356\357?\3667\274M\323\316\325?\222\333\n+\272L\322?2\371:y\316\r\352?b\346\314kT\315\356?\266M\212\323\304\242\330?\322#\010[\357\253\331?^\230|B\017\026\323?\014\343\023\000a\316\305?#j\0029e\303\352?\213\256\204\336\000W\345?\020\\\374\376\357\267\316?\2047Tb\241\237\354?\300FW9\347(\213?\300\036\261+B\343\223?t\324tc\367M\323?8\276\244\366\343\260\276?\260/p\376\315\271\342?\304\001\326\007\320\'\301?Y?\372^\004\210\352?\334\313\014\206\014\001\336?\000\260.P\221\022@?\322zB\237x\202\344?1i\355\307\265\245\340?`N\201\323\303\345\273?i\367a\033\330\223\347?\340\322\2334[\326\354?\367/\247%\221\323\342?~\t\266\3543\305\326?\241C\022 U\355\355? $\027\360\206D\241?@\321P!\327X\253?\201\332(\2246\304\350?p\026\377\006\017\342\255?\241\264\215\037\274C\344?\010\035}\321\265e\357?\"\224%\2044<\344?B\300Sj\001b\325?O(\212\374M\200\340?6?\252(\244\245\346?},\372\246\367\310\347?FB\310M\030\341\353?\370Z\tX\264\337\354?\236<\304\256\330\270\342?\002\265\246=\037\004\325?\260\214\227e\217Q\267?\205\n\250\r]/\340?P\24086\032\231\350?\024r\205\272\0173\315?\000.\303\206\264\253\257?v\203Z\n\366\001\320?\344+\307\351D\314\310?4\362\260\352\227:\353?\354\014\254\333u\347\321?\200^k\300\350\020y?\007\\\316\325\275\005\354?oZ5/\237\213\357?\025\262u\274\305\245\344?0\n=Y\321\274\355?@\270\023\240\325Q\266?\222jv\206B\035\342?\021\225\020\003\007\363\355?\260jj\341#,\304?\214t\035\t\222\366\300?\020\361\332\252\306\247\252?`\035\t\252O\243\221?\311\021\306\252@.\356?\320\235\272\317\304\351\345?zd\361\261\215\\\330?\264cP}\026\257\322?W`\326\302\233\021\353?\343@K#fl\352?\t\345!\357Q\212\356?\234\024\206{\272\n\322?F\262E6H\003\320?\202\2774N\243p\336?\326\020Z\312;J\327?\250h\022\345\224\234\351?\340\003\345\346\256\370\265?\342\230\001\032\231\307\342?*\222o\354y\264\33
1?\376\243\254M\243{\347?(\256F \231)\270?\023x\3366\370H\352?\226\347\350\003;\366\356?@\342\372\347\372\333\276?\005\371\364\355d\261\344?\333\361\032M\r\033\353?\022\307\327|\207\233\344?\031\201\023\341\032.\344?\037qx\232\017#\347?$\323\310\266\240\234\304?Q\322\177\'\022\337\357?U\234\024A\035;\344?\000\254\251\246\003~\213?\220\237\362T``\334?k\341\024\035\377\376\352?07+\212\010\020\355?\241c\013\333\214\362\341?\000 M6\250\244k?\333\351\376\nX\252\354?\216\004z\342\370.\326?\177\037v\231\355\324\340?XW\236:\013\332\307?0\036\254a\033F\305?P\205\315\212z,\353?\240\264\355\035\224\261\251?\2102\200\342l9\350?\330!\306\354\025K\275?\360A\242\230|\342\272?\360\235\275\267\236\n\250?a\337\230A\246\002\357?,\2436\301\201\211\335?)Jt8E\000\351?\240\005#&\033\001\323?XCq\266\325`\316?(\356\3261/\241\350?\210T\241\227\372\034\306?/0_\305p\322\345?\246\352\305J\243\272\320?\024~\177\211\254>\303?O\0044\356\203\315\346?\356\337\030u#}\352?\324,\203\027\221\234\341?|7\177\357\235\326\323?\026\331\254\355\350\233\355?\341F\215<\347\304\357?\200\316*f\367S\350?b-C\256\364\215\342?\312H\024_\207K\334?F\3505\315\331\241\342?R8}\214_^\323?\036\320\007M-}\337?KB\200k\237c\346?\033sS@\0339\343?\274\353\336\n\310\325\302?\024\010(\217\276\246\353?\270\266\034\232\337\301\352?\0149B^\271z\316?\020k\020\312x\266\313?\322\336\310D\204s\323?b\211d$\374\346\326?z\312_\342I\265\320?\300Hl\342L\253\356?\314>e\t\233\322\311?>\365@q\376\024\341?\210\010\211<\272E\266?\014\312\342\000\2676\325?\243\334\021\225\005A\346?\240f\345D\216\315\320?\304\367\332x\260\341\310?\246\203\0313\004\210\336?0\tq\313[V\357?7\303\345\3454k\344?\275\246e\233S1\353?\241#\340\301{\230\342?\333[\372\\Og\356?\265+`\343\226\231\356?\221\243\025\016\177\032\351?,LJ~\200\371\306?\330X/\227\377.\323?\234):\254{K\341?\244\273\374\017\324Z\357?\033=\324\201Ep\354?\250\332\2745\306\007\352?\002\037\365\205Y\226\341?\335}\332\376hq\354?\222\344\273\021\263q\340?\006\257\207\317)\266\340?b\327\303\221\234\236\356?\21
6\232\314\331\017\365\342?\340g\341\301Y\254\327?\266\261(\255\354\307\334?P\313\237\217[.\322?\240% \244\021\016\220?\266\021\023\372\372V\340?\360\317\327\376\024\031\305?\003O\356\276\270\367\354?\030\235\226\346\260b\313?(\212S\306\300_\311?\256\205\276\325(\353\346?\303)\342\026?\177\342?\210M<\333:\007\325?D\274\367\302\001!\301?%%E\320\'\245\352?k?R\347G\346\356?\354\311\203G\245\241\325?~\004\311\300t\351\347?\315\313\374\366\331\325\341?\257\0213\226.\274\346?8K\341nc\227\353?q\366\206\034\377\321\355?\205\000\340\037\310\330\351?\370bs\207hU\267? \313\217\177\263\376\310?pM2\374\247C\305?\351\233\206\020\306\366\350?\000\331\3309\255\361s?P_<\004\344Z\241?H\226Q\200\364\\\321?\251$\363,3)\356?\236\224/\000\312\347\336?\'\321\213g\227\226\340?\321\013\351<\262\n\350?qI\020o\321\247\345?\302upEG\346\333?\016\023t$D\264\352?\260p\004O?0\313?\002\311\323N\212\264\325?\020\352\324GYR\271?\261\206\237\253;\355\340?\3601\264\2771\350\333?B6\033\225%\033\333?4r\272:\335X\306?|:\325\000F\310\340?\371\252]7-\027\343?O\376\2105\007\256\355?@F\210\347e\246\262?\247\341Z\372\264j\343?`%\317\271_\241\255?1\346IT_B\357?\234\014\302\313V\020\342?\023X#\341\324\365\347?\230\374\303\372\276p\356?\330\232d\230V\312\311?\214\037*Rh\177\312?\310XdM\332\233\325?\250\026\206.\216\024\322?\345q\325\304%\032\355?@n<%\226v\217?\314\344\203Z%n\347?\2004\345\017$\032\247?\030\367.\337\277\262\307?\177X93V6\350?\2474\263M\212\314\345?\372G\360QC0\320?\352\201J\230w\351\350?\263D\355RC\026\341?\002H\245-\010\355\340?\030\237\363\037\220\371\275?\240\250|\241\227u\230?.E\360\252V\361\340?\266\003\300\262\230\014\320?E\215\247\367M\350\357?&+\n\351f\363\355?,\235\r\002\032\316\314?w\020\375J2\276\355?w\322\2366\234\376\344?<\233\013\017\254\021\300?\214\356Q\376b\013\335?\'~#\007\315\326\354?\355\314\273+\367\370\344?4\361O\027\234\233\353?\265\267\260\034\247\361\352?\370\304\016\313c7\261?\226\333\247\313y\353\330?\205\022\203\264\030\222\354?\177\306\2626<\036\354?xy\021\235\376\345\2
74?\021&\226\315\002\t\353?\320\252\000\305\035\252\306?\376t\331\252\236q\357?PN\236\211\330\322\276?>\304\251~\004!\320?\244\2620[Pk\324?:a\014,\220\361\341?\316_\222\024:\377\334?\026\323<\243Z\273\351?\232\315\010\325!I\341?o\023,\035\033U\347?BJ\1771\357\227\333?\242\304\013q\377\210\320?vu^E\216\300\320?#\232e\\\005\003\352?\200=$\276\016\246z?\374\266\254\005\302\013\336?\317aA\003\332\336\343?\300C\3573\022z\336?\274\232@\270T\t\350?\tKT\210=I\354?P\241m_\036\363\336?D\365\375Jh\324\352?\035\301!\356\350\265\346?\273\311\030\2340 \345?\242\244.%\352\201\342?}\222R\377\263}\356?\267\243G\252\205)\341?\317~@\351\330-\354?\2737\250s\327g\353?\200q\336\221\351\\u?P<\230\234\253\342\243?\300\357\n]\200\264\330?JQ\265H\275\'\357?O\t\034\241\275E\356?\025\030\354\265\341\243\343?\214\350\263\033\356n\332?\313;\014\263\370\025\344?hg\307V]f\346?\023\034+\260D\334\355?\264wL\233\227\370\303?\230\371\237\246p<\343?\t\017\241\375\210F\341?k\371\213G\341\346\356?\264\250\352\005i\232\327?\212\\\rj\211I\342?\265\203-\221{\370\346?\320\034^\034\245\013\304?8{>\247z\204\303?\334 \3122\260\377\316?\240\344\273\364\000\222\345?@\270\203\360+\030\324?\270\215]\027\203V\330?\200u\346b\246\204\323?+@\242]\031\351\344?\206.S\307\210.\347?\360;\300\373\241\035\345?\310k\264\001\275\001\277?PW\352\304m\227\311?\356\363\275\211\320\000\342?\353\2754\337\177\363\356?\000\320\377+\331\255s?\\\251\014\236\252\005\335?:\202M\215\372\'\343?\330\035\274\213\360c\353?\320\347e1(\273\255?\240\210\3051\324\313\317? 
uL\237\357P\270?\350\330.\302\004#\315?\251\261\006<\220\353\342?\002\221`\200\373P\324?\354\217SC\375\246\333?JDbO\352q\320?\350=\021\254\237W\274?\243e+\362$\033\350?L\006\005\232\367<\333?\352\337]\tzC\333?l\016\204h\347d\304?\"\370\207jX?\343?\264u?{U\270\321?\314\265\010\3146@\305?)\315\320\231\2214\347?\255^\256kFE\343?\273o\021\247;u\344?\310ix\370\033\337\270?\300D\345\224\321Z\335?P\357\020s\273<\266?&k\2007~\233\321?M\316\223#\232J\343?X\273/\302\221#\344?\254e\002\177\215h\305?\277\316I{\030\236\352?\312~~\317\363t\353?\020-\351@^\316\327?\345<\302\373\207\005\343?\014\334(\240\365m\311?\030\237\204\025\236\273\331?\025\262\303\221\211\260\357?\214\224\240\300f\326\323?D\251\227S\203\317\314?,\337}\211I\014\354?\337>\242\212\306%\345?>\013P\331\316\272\321?*S\003\311\250S\327?O\375\364\210{t\347?(\365;7\345\341\267?0\326\233\361\247u\240?\302A\350,\006\320\336?-\313j\004\005N\346?L\267\036;\235\204\334?\310\2237\260\202\232\306?\224mC\335Z\323\305?3\010\322\363o8\347?\n\232\275\230%\311\354?!\002\352\327\353\330\346?\355\305\375\023! 
\357?\370r=)\326\246\303?\032\341b\001\351J\347?\342HRK\377\330\341?\366(\001\"\315\272\345?\270|=\341\211\254\267?\262%\240^KZ\345?\320\037\231\216\347\363\242?\200?\003\340T\361\321?w \3756\376\314\347?t\342\327\337\231\251\310?\020;]f\343\264\336?f\036~\333\344\237\322?\246\277$\234Hm\324?\370#Kh\370<\324?\332\030\003\272\225\351\347?\351J\350A\256\265\342?\210\242\370o\036\335\337?\276\332\254WT\247\331?\014\356.\231\r8\304?~\210\252\260\246\353\356?\320\377\330\373\021R\256?@w*=\244;\220?\312c\024\262sg\346?aN_\310~n\353?\000d\266.\202\274z?\004\203R=K)\346?z*\300\033\004\352\331?X\036\200\265\374-\313?\300\027\001\264\370\237\343?*\205.\272M\300\323?\2178\270w\314\350\350?\034fE ?\240\341?]U\252\212\243\014\340?\230\345}\022\020\276\336?(iw=\013\317\262?[\013\34743\007\347?\024.\267\014m.\312?\244\346\215g.\032\354?\004\3653\272\342\310\315?\322`\263\323\212\006\346?h\216\216\261l\004\317?\235\332\022O\207b\355?\320v/H\tJ\341?2\230\342\356:\274\357?([\000\\\224\\\272?0zU\014$\364\273?\"\327\007\013$\245\320?\207)\260\215\006\352\341?\316L\356Y\034|\322?\340Ty~c\216\250?}\320\343\207\r\311\357?\324\211J\307\345\363\323?HT\270q\375\021\344?\271\177\217iG7\350?\003l\326@\324\333\354?il9\237\023\370\351?\346PQ\305\203\001\352?\236xL+\277\370\341?\310@\356\210\017\202\332?0T\254\303\350\201\265?\234K\375\235_\317\340?\017\234\256]\177g\355?L\034\351e\345\003\354?\364\034\254\327\221\365\344?\247c\3218P)\356?6\2138\345M,\333?\020\322\215\003r\t\343?\004\220O)-\\\355?V\035*@N\277\322?N\254\360\033\357\351\327?\252#[q\003\023\356?\'\356\372~\331u\342?\227u\330$a\027\343?x\240\002\324;C\301?\310\001L\230i\276\332?\010\226\213W\312\004\310?aF\r\205\230\301\344?\226\223i\tZ1\335?\336vt\335J#\352?\364{G\026\223v\344?\357!\254\276\3666\357?_b\350\372\210\021\344?\356~\252z\253\276\323?)\245\201\022\"K\356?\317\342?\250\365\324\372\350\240\306?;\\z\350\364\250\340?\235\233\036\030\037\341\344?\007\022\033\202\306\265\345?\0002m\222)X\344?F\016\213\303.\335\345?.&?J\200\215\
325?\244\255w\035!`\343?T\035\210\262$\212\320?\300\277\364\251S3\260?\331E`>\324\323\350?\026MqA\210\377\335?\025m\3147-\340\346?\204p\241Mzy\355?\347\363.\214\2019\357?\006\001\200\222\201]\324?\347Y\250\266\201J\356?\000\304\374*\217.Q?\006\304E\004%\213\343?\312\r\233#\365&\327?\374WRX\304\372\310?\204n\354_\036\215\305?8\216\266\326t\027\303?G\3252\201\325\221\342?K\032\022&\356\276\351?T\353l\345\334\245\317?Vxt#\224\325\342?\365Y\267\020B\327\350?\240qs\n~n\314?2\325k\341\205[\347?\000\261\356}\362q\256?\370g\362\324\210\310\260?\027u\215\3702H\347?~|\334\231\244\311\321?\2016\255\025\271\245\354?\344\2363\255\233\232\315?\022il\217\306\377\347?T\006\toD\212\317?\201\324\354\362\202\324\340?\202\323\237GEc\350?\002P8\307\177~\352?\2276\266\255\376\247\343?B\204;a\021\301\354?\274\r\033\000\324*\347?\254\336\335\001 {\312?L\375\016\323\250s\312?\340G\365\352\371\261\304?\0300_\323\221\\\313?Lh\304pO\366\322?\300\"CI\240\353\276?\265\256\250\034\253\254\351?\253K?9\221\322\350?@\273\276\276\304\340\350?\032\340@\267\214\006\351?\030\274\244\\\257\372\337?\242\246\272a\363\316\351?Y\243\377>\333\346\350?\034?\236\344k;\307?^9\373\300uP\350?\032C\247\340e\220\334?T\266\250\276\002M\331?\234\256Kr\350\017\340?\230\000\374G\371\357\343?p\357s\013\260\n\355?\037{\035\205_\272\346?\177\267[05\202\355?\200\001!H\204\020\355?\240s&\254\272\301\227?6;i\321\354\233\354?\260\326\341\003H\023\262? 
\237o\2044\250\325?\010]c\242Y\010\307?k\323\376\334K\302\353?,\234\201\337\032]\323?H\257\304\265t\332\340?*\315\324:\345P\331?\233W\271iE\016\350?\330\304\271`\t\275\335?\366-\332nXf\345?p/p\240\037\003\304?y\342\314O_:\340?1\367\252S\303Z\343?\245^C\357\255\250\344?\304\344C\373\336\343\347?V\2221\033\235c\337?\300\005\252J\262\350\343?\230F\305\".s\272?<\312ui\201\'\330?p\341\027\014\210\243\342?\023\330i\002HB\351?H\3009\320M\325\302?\000\302S\252^F\226?\244\205&\301|E\324?\034\t\343\333\211=\337?\032\375Xn\374\017\336?\300\370c\200f,\352?\300\213\354>\336\220\262?T\313\331\214\304\320\353?\274\352 F\021k\347?}\353\r~\244\335\357?\300\367\252\371\014\362\277?@\2343H\303\270\300?\317\377jJ\231\022\344?.\023:\300qP\357?\310\326\214k\327\351\271?\206\223\302\330\350\024\351?\206\033c\324U\254\347?:\323\351~A~\331?\000`\205\352>\372\300?b\275\347L\337s\342?\264Z\3270\213\002\337?\372\342\037C\236\335\336?\360\303k]d2\332?\220\254\240\252M(\322?z\006\324bb\362\322?\265\277\277\213\213\000\345?\327`\206UXB\351?\337\331\276\341.\230\347?\\\374\270Q\016/\320?<\352\300x\014&\305?\336\t\026\231\377\217\325?h\267T\212\265\345\313?\004\213#\342-\337\317?\3521x\224U\353\344?\334/\345\262\312\205\304?\300p\262\242}<\230?\270\366i5\202\260\340?$lV\306\344\265\342?F\337\252\251V\255\333?\032\251\375\3625\375\322?\223\022/\233%\321\343?\365\265\023\336\264~\340?\202\224\030\236\205s\320?\330J\370\373\022h\335??\276\344b\273\260\343?a\3550g\000\321\357?0\250V15t\244? 
\355\201\007DO\332?\177\007F\261\036\213\344?\210\001x\226|\032\346?\016\273\027\234\253\025\344?\n\016f\341\235\025\344?P}\343\351\t\t\331?\362\276\376\341\302\235\327?\350\ni\257R\357\304?\024\276\233.\035n\310?\314\373\261|\014\351\344?\240p\266\224\001 \271?\357\032\305\"r\210\342?\017\352,\367M\317\352?\255Qv\227\277\265\350?\316~\336\020\375\242\322?\244\2456\177u%\330?\260\344;\031,(\305?h\270\234\226\322-\320?\262\000yJ\332\227\323?\000\243\"\024\222\343\344?6F\202\037\250\373\321?J\201\355\244\266\031\356?\004\274\312A\350\322\322?\314z\245S\020\262\317?\350\3449\2070N\330?\260\343y\034\'\256\357?Cw\254\242+\257\343?\212\371\245\334\216\027\352?\206\335\001\232\353T\325?\342\n\004\261\315n\343?\2765u\207\313\245\326?\342\213YV\306`\347?\334\032=\014f\310\335?\312\316\005@\241\203\323?\016\322\216\372\264\302\355?^-\230)E\303\345?\316\017\344J\025\201\342?\326\322\303\342\"3\337?\213\3705\0076&\346?@\243\232\307\344\200\261?8\313\032\243wQ\346? ]K\231\226\205\235?\304\330q\376H\330\344?Au/I\211\242\354?$\257\323\350_&\312?\325\227\273@\261*\341?\3203}\375\236\241\266?q\264J\341\373z\354?|\234\367Y\250!\336?t\300\016Z\342_\326?\\\301\266\362\237\261\346?\300Dv;\232\334\273?\020\372\262\026\005C\322?C\343\255\004B\230\354?\020\353\273Y\223\370\352?l\274&\013/\235\330?\373\336\303ZP\325\352?%6\364\256\202\217\354?\300\225\231\204\003\215\231?\220\227\003\223\002\372\245?\366,x\210\314J\325?\020\225-{>\035\335?@\3258\003\023\313\324?\343\351\032\210\263`\354?q\347\306j\233\234\352?z\323\222x\276\271\323?\024A6\212\374\020\336?\300\273K\301U#\256?x?ji\327\235\314?\024\317\030fc\230\311?Z\234\363f\315m\333?\000Y\240\304\tb\257?\010\267\035\371\211\311\331?\251\374\306Q\036o\346?\367J9\033y\327\350?\013#\275\203%e\341?!L\3722\177t\357?I\017+\022\277\t\350?\360@\225\373\027\254\311?\3109Q\314\026\r\345? 
\301\362\324\336D\355?\230\002\267e6\307\325?\\\342x\235\303\224\326?\274\316d\373O\005\340?l\032=\354#\221\347?2c\352\247\037\005\337?\230\027\031\317x\206\325?\367\241\241g\246\203\356?09d\177/Y\343?\255M*o\315X\345?658\"\321\231\332?\364&\014\223\323\316\327?\276K\240\022\326z\331?,\211\211\2152\357\333?\324\261\313\225\010\260\351?m\005\235 \265\243\350?\000,HX\217W\316?$\320\356Gd\025\326?\260\337\024\242\017\326\353?\240\035L\003\322<\250?D\026\322\201\2136\351?^\320:\255\365\307\324?\325\307P\206D4\353?\210\237H\250\357:\270?\200l\231\006\353\364\220?~\337\201f\253~\340?\340\247\346Y\261\350\341?5\306\266\233\250X\353?\017\337?\227\265\254\353?\016z.\021YM\323?\364z\205S\273y\322?\230\233J\re\240\271?\262\t\327\0212\221\320?\357\267\210\246E\"\357?\305q\320\245\270\322\354?>\376\303\205\305\325\354?\342n\251\226\313\002\343?\320o\315\367\204\225\327?\037\205\"\002dP\352?\032\010Z>\370r\345?\213\005\213\336\020\211\353?\332\310^\2768\233\336?\272*G\206\377\'\335?}[\003\2552\310\352?g\363\007b\252\356\353?H\324\227\301RJ\352?=&\327\370}\374\355?L\363\016^\232:\331?|\335\0074\237\006\353?\300k\313\337\370\031\322?zs\263\333\007\256\326?\363\027\370\353\014+\345?\374}n\255)g\330?\226\227\0319\223\317\331?\370\333\221\300\366\350\317?\276~_%\371Y\347?p\177\274\333\345z\356?\3249\036\356\244\255\310?\214Z\010\374HD\335?\270\230\315\004Q\257\332?\265\247d\317\327h\345?\352TV\023J\354\322?L\225Q\357Z\267\347?A\006\255GJh\352?Q\367\266\344\357\273\347?;Q<\227\250}\351?R\025P\031\010r\356?`P\023k\316\243\226?1\030H\231\226\002\354?\257\204\374L\230\n\351?\2304`\364B\242\270?\032\341\267\370\207\366\322?\374\345\306@]I\331?\341~\311\262\323\214\340?\000,\262x[\275P?\240\216x6\237+\251?\225\252g`\311\336\356?)\237\263\322\257\205\341?WK\356\013\014\036\353?\356M\277U\374\262\324?\203\2202\264Fk\350?*\002\325\263\337\331\345?\374\221\250E\314x\333?\344Cf\202\212\373\314?\022D\200W\220\305\353?\340|\330FIW\307?\3008\237\005\206\376\324?\300\001#\315\200\031\224?\237L\037\3
36L\002\355?|\343}Q\253X\314?\200W\t\354\202\362\231?\220\037\026\370\r\021\323?X\013\005\313\027\241\350?\372\356\371\351\301\253\356?\250\301YF\374\256\274?S\365\366\305(\236\350?J\354?\230h\302\337? \232\231D\030h\257?\005-O\337\013\205\353?h\276,\026~\025\300?\204\206\322|\255\344\313??\213\031:07\344?@+A\3634>\221?\350\277(\346x\306\331?\214m\230\363\277\214\354?\000\303\232|\303\"\357?q\221\303C\323\313\345?\000\333\250\222\024Z\273?D\321FCO\375\337?\355\023\037ufo\345?xU\333\347\351\034\314?\244Q\261\261\224j\344?\304\n\310\362\301\225\305?\276\254\3318\3463\334?\216\347\373\247:\327\333?\211g\313\300\347{\351?p\331\351\316\2175\263?\222\336m\037\327\214\335?9\240\262\030\243L\354?\'4Pt\316\021\341?P2b\251\223\027\334?\375_\263@\317\350\352?\000+\001:\252$\350?\230\376\333z\311p\306?\353\224k^\033|\352?\214$\213\273^z\357?@\323?\326\301\"\337?\rr\211\360\273q\350?\024\317H\345\374\242\322?\3327\362r%\352\327?\263(oa!#\345?\307i\205\327g8\340?\200\272\222\n\215\311w?M9\326\360\276[\340?\320\210\013}\226p\324?\037\003\032\267\252\212\345?J\324\253\017L\251\337?x\322\226G\014i\263?\003K\023\276\250]\342?\304?\320i\327\240\324?|?*\317\216\242\315?\366\021i\n\277\\\321?\324\251\031M\313\027\303?\345\035\221\256\363\027\353?\307\251\362}\203 \347?PIG\232X\245\313?\363\177\223)f\026\356? 
\213\316;\3339\260?(\022\264\231(\014\350?\340\341(mn\265\251?\326M\315Zzy\322?\26491L\330\311\353?\272\004t\222\272\333\342?\354\214\210\302{x\313?\352\242\000\232Vo\333?`\222\006\246\273d\222?\360\027\235Hw\200\330?\300\311\230\375\334\206\231?[Z\221\026\315\307\340?\370\262\256\\\335\032\316?q\306\224\306+\330\346?\330\312\243\t\021\'\356?\273\363\311 \217\226\341?\226A\273e\210~\346?)Q\214.\201\200\357?\004\201\260\355\323\355\304?\024\310\355{|\336\335?`\313nK\266!\251?QgJ\266\232\013\346?\373\355u\355w\254\341?1\253LNK\213\356?C\213\276\270\271f\351?0\246\275\263\371\201\355?\000\277\030\263\220k\263?\014:u\"\227\257\330?\360\241\264\374M6\334?\214\334\036\010@3\301?)\363%\302\201Y\357?m\233\305\016\237\320?\220t\3607\236\024\260?\370\214\\\372\222\340\324?\244{\0064Q\032\320?6r\001\220\213\035\326?\200\'S\2639j\264?@\351\273\254\331\355\334?\026!e)\301\331\354?\342\211\300\271%+\323?\362/\327\213c\320\322?\314\260z\212\320.\326?\020\252N\"\373\344\251?d9&\322\246\346\304?:\255\217\026\200 \335?p\303\372\251\331\306\267?\022u2\324D\205\322?-\265\356N\252\201\345?\370\016\273\363\276<\312?\334\246\255\214\374\002\313?\022\271\300\315C\377\350?\320\027\177\216\242\035\345?\200.+\221\326\206\212?`\367$\267b\007\353?\243\261\002X\370Y\354?@\004t\254\360|\244?>\332>\352\321\224\337?L\245m\356\000\000\317?V\250\037\253\332\333\321?`\324\211\374\235\362\230?\370,\001\316\264\001\270?\033P\363\313\333\n\354?v*\347\344 F\320?\212\346\r\027\201\224\346?\226\213WGF\024\352?\2121@\376\205\326\330?K_\271\3747\263\341?@!\263\262 
\355\202?\356\271\307\310\270Z\355?\000^\222\024E\276\312?\306\311uD\317\206\320?\376!\003\3070~\322?4\314Gd\313a\353?I\260\005#\013<\344?\344\033\334_\221\340\310?Q\241\372[fQ\343?4\331\341\301\332\254\353?A\370\251{a\367\353?\256\313Iv\211\220\351?0\325\270D\304\200\306?\320\235C<\276\307\326?h\332\356]e\343\307?H\241G\312\200\001\332?v\246\373\362\324\330\343?\202\263\024D\343\007\350?\3446zT\315\214\351?\370\002\033e\342\201\346?\226jK\303\210N\351?\240F\006\372\315\212\256?\n\304\322\363\331D\340?n\251ucr\240\321?\360\352\370\204\272\262\347?\320\352\337\023A\016\347?D\342\013h\343\276\303?\320U\303\261*\273\262?\002=\371D\226\344\326?\340\214\3633.\226\266?d-F\225\356\320\346?\220v\242t\365\200\254?\007\335\346^{g\354?\326\266\362xFR\326?\374\tcr\373\362\345?\340/\254\215\301\360\305?\320\2337\306u\270\333?\233\005QC\032 \353?\277l\330\225\034\376\340?V\315\3373\302q\322?\320KqB\\H\324?\t\215\252\347\323\t\346?$\255-\257\336/\326?\234\276\277\305?L\362\347\370_\247\322?\252r\372\203\tW\356?\300ey&\353\234\353?+\036\327.M\300\353?\350\036\302G\213\310\260? y\214\016{\277\270?\332\032/\201\241\003\320?C\022\265\007N\273\347?\026IrX\227\356\325?t\031)?\031*\352?\234\327\267\360\217\207\315?l|\005\257\236G\343?|\243\302\237\234|\302?\202\331\024c\376\311\330?\027G\365\300\204V\350?P\022a\377\025\221\266?x\204\326I\334\307\305?Xb\177\262\263\325\345?(\034\223XI\n\342?\000\005\356\325\342w\323?\244\034\211\221bR\347?\250!|\272\341\324\275?\216+\235x\\\204\325?\262\266T\233\323i\356?\316\005R\270\2516\332?`\356\r\tn%\260?\335mb\320o\363\357?p\235#t\276q\306? U\320\003\344[\332?~O\266\340\341\026\325? \0219\346\311\003\242?\3041\r\201\207\314\342?\300\321\234:\222\341\311?\264\3607uH\277\331?\374\270.\374\200\223\330?\3748\005t\373\235\314?\256\212;\021\262\035\340?\237\\j\253\0174\354?P\345\035\031\376#\356?\340\341\030\317\335B\257?\210\373!\022\377]\337? 
\203.\371{\343\304?\303\226\023\325\n\315\341?\n2\360\371\352\204\323?\242\037\'\333\362\001\323?Y\246_\326\025h\357?\200\334E%\330\227\201?\020x\024|\221\344\246?\364\021\373\2161\237\331?\3072F7\333\002\357?\274\206\255\343\350\302\317?\034\363\250H\360\270\336?\312k\253\037\005?\325?\210\315\200Y\237\220\266?n\362\273\361X\334\320?w\360\267%\372*\340?\3206\033\035C\t\314?9C\225r\216\230\343?\217\314k\340OW\343?\352Hn\211;\242\346?\313\332\246\253\260(\347?\355\227\266\344\237t\355?\007\"\341\330d&\351?h\331\3565{\375\321?\t\353W`\200\366\352?\356\306\336\236\222\316\353?\203\332\371le\203\355?\3014\270\327b%\356?O\360,\250$\233\354? \201\256\2203\252\355?t\377\357\004\tp\317?\212\207\033k\317\227\327?\266r\226\311\265\362\320?\320o\375\317,\315\267?U\t\236r\016\r\350?\234\\\364k\204\026\310?\374Y\305\212i\005\331?tO&GqK\354?@\203sV\310\264\342?\017\273\366e\224 \354?\350Z\210\272\364\232\310?b\007\225\363\r\335\342?\362\370O\321} \325?\354\231wB\324\246\343?\\\327u\204\\\364\353?R\337 
x$\022\343?P\271\2622\265\257\351?S2\315\272SV\340?\324\206\230\341\276\030\314?\262\210f\2638\332\354?\252\364\301\330*w\345?\220\247b\335\351\032\246?\272\211\270\205\255H\347?\310\243G\2215<\274?\244\360\362\353\2117\314?\344dX\311\237\020\352?p\303e\323\344|\315?H\t\314P\026\315\274?<[\200Z/\315\341?\3307\223\2477\221\275?4\276\267\3128?\304?\367\324\023\3514\000\351?b\035H9h\223\320?@\000\036\244K\005\331?\365\033\301\032,R\357?\265\335O?\234\330\351?\326\326\337\037\276\313\346?\270=_Y\335\336\341?\322\013%#]\342\351?0\356+\257+\344\307?\260\314:m\236\341\325?\210D\347j%>\260?\300M;#<\341\232?\001|aL\234\r\347?&\\\227\026\263\020\352?\340[\240L\222\277\257?\376\254\013\263#\275\345?\020H\254s\374\247\255?\370\253\373,\244\224\307?\000\365\026V\313l\206?\334\'\217\230H\266\313?\0233\312I%\345?,\020\303\202\201\016\346?\2448\221\242\300\267\330?\300\333\325Z6\221\351?`\026\265\207\014\251\355?I48(\003\210\351?y\254\362Sd\270\354?Xk\300\202\376\355\277?\010;\262\0163\316\355?\205*B\372\232\343\354?H\020\343XN\010\303?\220\020r\202\000\352\267?\351\\\237Q\373\212\350?\333\376C]V;\345?V_\202\363\377\330\324?@\032\356\335\304\357\254?`\255\330{:\252\304?\206\321@Wd\010\337?\3560\314\3509K\344?\225\177^\312\233\375\351?\347Mf\0146\367\344?y\342\327\352\265\r\357?X\031\327i\355\312\334?8\025\372|\232X\263?\321\374M%\200\274\350?\'\034\rw\020w\347?N\313\3303\370\355\333?\003\236\366\031\314\216\353?\236+\311\247\306L\351?-\017Q\273I\005\357?\342\277\233\002Y\314\357?\277\324xV)8\351?\366Y|KZ\213\322?\317\020\374l\336\003\341?n\356\372\014\300\224\342?M\210_\317S\352\354?\376*m\362\314\355\354?\352\257a<\250\221\333?\021\037V=\266\357\356?\000g/[5d\335? 
\212\313\256\246\357\276?\020O\301\316gs\301?\220k4/\363\211\355?\032\001|u\260F\341?\254\264C\211!a\333?\220\261I\010C\201\340?\"C\266\244\021q\354?\243\277/\355\022\266\340?\006\220:\264\3779\355?\330\3754\263\204\314\320?\324K\265mE\344\307?m\020\346\"~\325\346?\030\362\262\202q\302\336?\244Z\341@\320\341\354?\323\201\013`\302i\342?J\250Vc\263F\337?\242\347\2139d\330\346?@af\353?T\253?:\014\203S\3474\326?X\200\204\255B\033\266?\340\366\020\206(E\303?\000\335Q~\003\275p?d\014\352\324\215\343\316?\270\006\305\250\242\333\354?\204@1\267\221\325\346?W\367\025Y\273N\341?G\307r\341`\206\342?`x\264\244\tB\242?\272\220\206r\362\246\335?P\237\302\220\336\211\264?\211\334\001\034\363\034\341?pF\225\016\004\355\244?\326\377\374\223\',\335?\222m\326\374\016-\357?P\362\237\316\207\343\311?\260QU\263\307$\274?9\224/K\'\331\357?\214P\034ZLl\353?\324\'$\340\340)\334?\360\211\3775\022v\313?h@t\325\314\237\325?\000L\006\320#\177n?w3p\026w\022\341?\210\363<\'y\243\310?\024\221\177\345\375\255\323?\324D\327\360\022\355\311?@\237{\374r\370\242?\036v\376\260\201Y\336?\020e!^\267\223\262?6\243\3200\242j\322? 
\212F\373\321\031\256?R\261\007-+<\335?<\360\320u$\273\356?\374\300LP\263J\355?\242\306\207E\300\274\325?\320\214\311\257\037\365\243?h\016\224\222:\323\354?\356\205\3309\306.\327?Xo\"*\te\336?|p.ht\306\304?6\n\"N\320D\322?\023\251\031\004\242l\353?\340>\2345<\325\246?\000\030%!\356\014\331?\365L\210Gl\340\355?\250/\013$\263\013\271?\024,\3741)\000\323?0\367N\026`\333\317?s1\\M\334\005\345?.`\"\256-\257\345?\035\255\027\345D\023\350?\240\325\251s\021r\347?`&u\224\373\260\265?Z\347\257{t\016\335?\362\325\200\031\322l\320?\3004\300\364\351\234\201?N\362|\311N\013\332?Y6\354\240\373W\344?\240i\373ed\340\334?\031\265\363}\306\204\342?\000\027f`\232\213\233?\326\275\321`?\214\331?\346\020Ndo\240\341?\200\324\351T1\233\352?7^p\0141\300\343?S\243\260\3635\356\351?>\211\022\241\240e\355?\240\373;\364\276_\254?\250\321\217*\206\370\326?\202\350o\2352\232\344?V\034\355\317\034$\343?\020\364\212\264K\311\243?#\272\335P\240\257\340?\n\332\236\323\2239\324?>Y\214\260\024\370\347?r\372\241\316\234\341\330?\236e\220\233A\033\354?\327\356L#)\232\357?\360\271FI\257A\253?\000^SP\236\036y?\031\253\017u\325\323\341?\".\031\337\036T\342?\364\317\241\201\341\331\347? 
\331\363E\017,\327?]MV\201\275W\344?r\335{Pm\377\354?\255\263d\222\360\362\346?B\266\323\212L4\322?\210\335\250\001tg\265?\2002\336\035\253\206r?[\367\303\335|\346\344?\306\320*N$`\330?Q\030\233\300l\231\352?X\024h\215a`\324?\327\236\224\035\030\351\346?x&\272\220e\033\355?\357\271\323\304\231\211\355?\347c\004`(n\347?\266\347\207\210\014\276\324?\206\3473\224,$\343?\360\311)l\236x\265?\230\035\275E\353O\274?l\251\237J\320\245\343?\243z\030\013\272\327\357?,x2\022\202>\335?\340\212z!\213%\245?\266\332P\034\372\350\336?lA3\205\322\024\326?,\352\215\307\265\023\321?\210J\'\3556\313\327?\252\325gt\267\371\354?\022;\211\346\307%\323?V\316\222\211\036\246\325?p!\246\277=\353\270?V\030\036\265oU\335?\200\300z\252/\rz?A\242\021l )\356?\300\373U\355\207\037\273?\205\320\250\265\t+\355?\350\021b\022\236\202\356?\026\262\002/\"\003\352?\004g[\346\247\311\333?\000v\212\356\317\237p?\032\214\325\326Y\005\355?\350\354\235u\227W\305?\324M\005\356w\224\311?Rw\266\323\260\310\343?\237q2bl\246\355?J\025\262D \3220\037\346?\212\014\364\006]\345\357?W\363\206b\237}\343?\244\226}{\200g\320?\320U\241\3264\212\356?\206\357\010I\342N\342?\025\014/\242\2763\351?\310\177\212\230\344\250\351?\320S{\n\375\347\276?;-\377,<\272\346?\260\332\306K\210\333\276?\362a\247\rm8\340?\356\314\315\227\263\377\353?_)L\204\261\'\357?\370\022\035\315\205\224\330?@\253\242Q\354_\344?VV\\@\032\031\353?\005\0043q}\371\345?\342(\327v&\256\345?\017\251\201\314\0357\353?2\037\3101\267\022\346?\320\277\345:\312A\304?\320\254\240\347\276_\241?\302\216(X\005@\321?\005\014\002.\t\244\356?\302\227\266\006\223\356\350?%]\374\301\273\351\340?<\366x\274\021\253\330?2\225 
Q(E\344?\325\327\353\275l\021\354?\026ch\366\211F\333?\314\244\252\370\246\316\330?\311\301\005:\314=\341?\262v\245?\352B\331?\260\316\266\367\r\252\264?\350N\352^g\271\317?\206\\\264\340\362f\353?\220\027\243\320\375@\301?\202~\360\316>G\342?&\276p\224\024~\325?\023\255Q\024\355o\354?\224d\2305\221\334\307?`Aq\375\264\031\224?\236\313\030\337\260\355\350?\n\220\243q\221x\327?`\301\2232\003\026\273?\304\230\343\330v\"\330?\260\234\210\337\275j\325?K>\241\337F\265\355?\005\200j\217\370u\353?\330c!\276N|\342?zo\235\"\213#\354?\306\207QJ\036\311\355?@\322\265\211\320\255\241?\320CL\313\244\364\324?\204\315\n\314Y\361\341?.\371q\323\245D\356?\350[Bx\256\272\301?\031\r\362\204\t\034\355?\325\204\355\256\232\004\345?\032\247i\035\300,\352?\3332\274\020oW\355?\374\0335d\356\267\344?\312\205\360\213;R\351?\260}v\363#o\300?\354:\035\2455\364\336?\346>@l]\262\346?(\261\213\273\033\'\276?O\020!K\357\r\351?\"\331\234-I|\354?\341\010\252;\222\265\354?\360\000%Q@e\343?*\034r\365oO\321?{\321\371h\242E\343?\350\021\361\246\314o\352?&\210c\370\237\360\321?\340V\254\344G\355\305?t\322}\2323\030\313?\220!hD\307\367\314?FY\350\330\367\300\341?xF\t\023\006 \313?(z%QHb\322?\000h\372b\364\225\311?\021\3714\265,\307\341?!NT\226\242-\340?d3\216v6_\337?Q\347\263\224o\372\345?,\366\223\023h\215\347?\r\224\237\372}\206\342?(\303\370o;\271\300?\221\261\315\335\211\223\355?*0\237q\"\013\343?\200!\323a~9\275?\273\346\322`\365\306\341?\236Y\026_3P\356?\335\344\351\3509\031\347?\020\023\374!8\177\320?\020y\203un,\267?\020\203,\327S\027\251?p\201\016i\341\375\335?@\237\324\200\351?\220?(\275jc1\271\324?g\357#\250\037(\345?V~Eq\325\024\324?\034YU\210\372\207\304?pO\025\027\332d\340?\214\356\"\263w\217\344?4%i\332\004(\355?Z\230\207\235LK\345?\254\002\355M\300R\327?`\256O,\235\304\265?\300\233K\n\307\323\313?\240C\035\265\"[\272?\246\340\206e\006\225\342?B)d\243\335\224\334?\231E\373;\021u\341?\330\2731\354\300\007\274?\376\t\013\'\260)\355?\316\350uW\277\007\343? 
\276e\216\231%\330?\366\232\262:PF\342?\334\235\033Iw\360\356?\251\201O\356\214\003\343?;\311Q\233\305\233\350?\206s(#\374\'\327?\234\201\273_\223\263\322?\373\177V\005=\337\350?\204\302\324\034\355\324\347?\036\245J\313#\340\337?@Z\366WS\330\237?\324\365g\253\037=\304?EF\n\203\246C\345?\323\306\224\016F\365\354?\017\263\343R\324\231\341?\000\345\327\371\343\307\257?}+\004t\277.\340?\321\251s\304LV\351?PT\200\326\\U\321?\264\354\222Z\234\222\341?\022r\n\270\305\363\323?\215\000\014n\225\212\354?\2318\246P\033\303\350?\014\225V\371\034\\\305?\240z.=\\\272\274?\344XJ\261\230x\332?k\300yT*0\352?\232\023\207\3538F\351?\210\2108\267f8\304?\330\001\022\212\224x\332?\217\204\252\026\260\367\344?\266\326\263\230\035\344\325?\364+\277\021\022\316\352?.\254\351h\362Q\345?\244:\320_\216:\324?\241O\274\034\024|\341?f9\037w\322\311\342?h\224\243\230\236\346\331?(\000\314\275\211)\353?\332\255N\001\215\356\330?\365k][\3058\347?!\377;\303\223\263\350?\034\332]\214\315\227\307?\236D\3665\202#\350?\220\304\335\271}\232\342?\347zh\266\2400\354?l\342\236\360#\240\344?:\234\331i\360\271\321?\024E\200\330 
b\303?PdNV\0141\347?\202\310\261\316\234\005\322?\243\376B>!>\345?\324\334\253.\036\331\320?\366\253\337C\033\026\327?\3008?\362\247K\233?IS\245\200\341\346\340?\310r\343\311`Q\262?\n\272\360fB\257\327?c1\222\374\177\346\347?\262\376[C\354\022\337?@\016WW[\347\223?\336\373\227\275\033Z\322?6\233\345\2744\312\342?0l\241T\237/\316?\334E~\346G\300\307?\215\356b\005\341\327\354?\234\321\030]\351\242\304?\236H\014q=\231\357?\250B\376x\335F\261?h\213\305\257\023\310\322?\321\330C\036Qr\343?\270Px\243z}\267?\334x\177@\251\270\331?\005\376F\331?\\\357?\t=\344\216W\355\356?\352cg\273\020_\345?f\377\360\360L\376\335?.\346\007\034\3361\323?\202\225%\006(n\326?\340\211\223\202\374p\310?\210eS\037@\357\271?\264\232\273\244\216!\327?\004=\tb9\376\333?\216fr2\253\"\346?\264\347\314\217\355\227\352?\314\256q*Sh\351?0<\305\010(\036\322?\226\330\313r\220b\357?c$\355\322\177\"\355?P\342UOx\214\272?\034\361b3_\005\335?\035\224mp\361\234\341?\236\213,M\203\242\332?s\370e\366\t\312\354?I\350r\221\355\362\347?\020e8\273Ca\323?T\036b\354\343^\303?\334\235\363%\375\253\347?Dy,&\301\277\350?\345\327\2124\261\347\341?\260\340/mB\"\352?\374\323\021/\n`\327?20|\032ga\356?8/\001<\200\351\333?T\003\256K\324\246\310?\025\252\202tl3\355?t\235L\325l\305\307?\200\217\243{\217G\267?\344W\037\033\346n\320?&\250\303\341\314\227\342?\022\237\237\010\034u\353?\322_\212tm3\340?\217\267R\005\263\345\345?\200e(\025\252&\266?y\364\227\245,\022\345?\260ke%\346\322\317?|\277\016C\253\035\336?\300:W\201\223l\330?\214\2031\235,\204\353?2\214\314\026\240Z\335?Pv\273\373\371\323\357?\223\270ud\304g\343?\205\270\343>\276<\353?\354\275\r\247#\000\357?\355\263J\215\210\257\347?\250fN\272\200\343\276?\300\313\245n\320\245\344?\345\362\347\212w\353\354?\265.\034\366\311\211\353?H\311Kd\215>\277?\234)\211\261\365\362\312?D\004\201]L\225\310?b\375\306\tI\321\340?Vbx\252\201\213\324?\204\230w>\264\346\324?\376\030R,w\344\326?\022\255\374\220\320\010\330?\004`si\206\353\301?\213\214\363\343N\237\351?\232\320A\226v\'\352?\n\
2736\273\254\343\325?\022\016\013\t\226Q\337?\350t\223+G\226\333?T\262\215\271@\324\320?\340\317d\336\240\264\260?\3334\260C\237\252\350?\022\256{g\376\014\324?)g\366\252\313\360\341?t~<=\316k\342?o\377*\255\3501\345?8\034=\254^X\326?\325\271\251;`\367\344?\030|q\214Z\205\311?\212{\302\376\233~\347?\205\004\334\336\212\251\t\274\330?@W\316\336\377\221\273?H4y\304\tM\305?hC\36661\014\350?\205hP\000\275\317\346?\243g\343G(\032\355?\310\311Xd(\235\273?\344\362\265\000\206\347\314?\032\0073\261\270\204\325?a\t\262a\304\256\341?T\335*5\037\'\340?Jon\252\210\352\320?\300\310V\206k8\330?\340|\265\340V\234\302?\340\253\230=\263\327\265?\350@\272\211LB\313? {\237\357\321\370\332?\224f\217\231\247/\303?\200%\035\3137=\267?\354 \244qEA\314?\310\014\034\226d\362\347?\026\376\256]!\222\343?>\351\0251n\001\335?o\312\303]\025\231\347?j\256\267\'\020\r\346?\240\257\336\0228\273\311?)\206\304\262\232\251\343?`j\016&vx\244?\316\314\346\255\223\232\340?\221\304\360\315\237\204\340?\330\347\037V`E\356?\337\307~\215\350~\345?@\324y4}G\247?hca\270i\326\345?\006\034\3474\257\346\335?\274\317\347\374W\340\305?\010|\362\026\267u\325?PO\013\343\315X\323?X!\023\367\001\365\343?\361\0350\007\241Y\357?\250}yk\376\320\270?\313401\014\321\354?\227\010Nz\203n\343?\277\005\220\000\265\024\356?x;\256\217\373D\311?\364\216\000\305\021\017\336?\330\177\324\220LM\322?\270\375\036\213C&\332?`+\254H\2464\355?\264s\036M\r\341\342?\216x\334m\347\250\347?\310\377\364\357\330\363\310?\337\001i]\221/\353?\244t\323\332\220\356?\370\263|e\210\005\262?F\332\031\034\222\275\354?A|\344T2{\351?E9\341\300`\255\355?\210ZF\014\307\344\317?\336\361\332u\354T\332? 
\362P\223\302\300\333?_\002\251\273\313e\354?%\013\300\205Mm\344?p\340\037\326\037\367\267?\226\367\212\215P\321\323?|\220\351\033#\263\333?Xu\001\332\037\307\357?\306d\020y\354W\322?b\2734\364y&\346?\020\305\225F\332*\265?2\211\335a\306\341\355?\034\276&\312\233\331\327?\340\264\271-l\211\266?\246\033s|\364P\323?\222$\0031s6\345?\314\\#\253-\006\355?\235\267\312[\332\033\346?\204,\000\236-\013\350?0~\030\276\300B\245?M^\325\215A\300\341?\252\016\315\t\263r\326?]P\357\n\347J\351?\310\327\013?S#\301?\305S\246\023\255\273\355?t \032\366h\254\321?\256\361]\300\256\324\324?-\006\2629\336\234\343?0\0060&;\310\343?\367m\225\373\212\277\341?\240\"\337\370\265$\225?\013K\277\343\027\230\355?\310\310\"\003\"\005\263?D\216\213y_\243\324?\3001\225ru\331\226?,\363\350\351\324)\311?\315p\025\345au\351?X\365\276\246\272 \345?<\245)9\241=\301?\006\277\225\0068\242\341?\000\323\211\206\002\374j?\342h\t\301\307\377\320?\226/\030\225\350\203\342?\355\n\224\3262M\345?\360\010f\t\314\001\332?\270\230\302\364;\374\276?\350\311\366\034\303v\271?nwa*f\341\331?\300\270B\267\246*\260?\342{p!\266\014\357?`\370\367by\000\337?\212A^\3202\r\333?\024\001\302\021\006\352\307?d\204?)\247\004\305?\031\346a\2308Y\356?\\u\241?T#\341\370\205Y\316?/\354\324\324\233\347\342?\332\303R#\313\372\334?\260\361>JI\375\354?\230a\222\"Gc\306?\271\010\000\211\316\211\341?L\207l\202\350~\313?\305Yf\211V\330\355?\020\333a\365\210e\341?\306<\225\246\376\341\352?\261\033\337\263\314x\344?{x\233{\022\375\352? 
!?\326\336/\266?\304\234\026\223y\312\310?\313\211]\316_e\350?z\315*\266\032J\345?\216\021\364\261\0069\333?\030\020\013\333\200@\342?v|\332ao\377\321?\320\203\017U\372`\343?*\330[R@\207\337?\374\327\304r\025\223\347?\271\351n\330=i\357?\362\365\342[\355\232\332?\274\272\355\306<\035\306?A\330\262\007D\226\357?:\313!\001\222\024\353?\226\272\302\242\336\203\334?\225Z\375\275\004\277\350?t\330re\371\266\350?\034,\271\224\000\341\311?d\212\325\322{\r\331?h\260\374\245N\262\357?\036o$\226\007>\321?$\006$\004\216\t\305?9l>mzh\355?\236\326\0250\220\306\337?\327\232\342\232\304\212\347?\002\022Z0\346\277\357?\323\332\036\307C\002\350?\372\230*\336HJ\342?\036e\252\3427\230\342?h\312j\301\265\343\261?xMa\374%w\311?\3303\361\242\211x\354?\226\207N\234\203R\322?\202d\223\025m\261\326?HsM\343\241\177\351?\302v\256\362I\246\327?\220U\230\312\206=\354?\247\3703e\241\236\356?\"\241\327N\354\347\352?\000\255\3556\325\002\253?\014\006T`\034\344\304?p\017\215^\016\020\354?iZ\241\005N\366\345?\344`1 \324,\313?\310p\027\016\224\305\336?Xmzr\305A\351?\362\330\376c\335:\323?\000\237\230\360<^\264?X\327\270\256T\312\351?`\244\341\005\363g\355?\372\344\362C\t\330\\\315?z\n\207\232G\357\357?5\n\315\262+\330\354? \200\027\266}~\226?qr\336\201\033I\357?\207#Zs\357\215\350?\310\234r:p1\277?\300\014\rz\250\276\236?\257FW\331\310\227\357?(\206z\027-\356\262?\036\"\233\350\\\036\352?\250Nu_\353\016\263?\354\024N\304\252S\345?\254\336\372\275\253\266\336?\300\r\354\222J\327\357? 
\204n|\000W\236?\374x C\257\306\310?\326,\241\310W\353\355?\'\254*J=*\344?\250\347\243\204\206\032\345?\362\220\347\377\220\037\327?\'<\363\223\315\221\345?\340\255\365\264\005&\344?\342#\376i\241\210\344?\244\254 \265mK\335?\014@\027\250\332\366\325?\020F\200\214\324\351\336?\216\013\277k\310\022\344?\356\262X.\257V\330?t\373^\277\314%\301?|\313\353\205\2352\323?S\032\226\355\305\327\342?\320\252A\357B\357\242?29\327\236fp\332?\240jV~\235!\330?:x\200\227\202\372\321?\362\023J\213\304\257\325?If#d%\357\352?J\237\330O[\373\346?\242\350\277|\034e\324?B\345\"x\332\'\330?\000\363\010\212\340\232\340?\230\355$w\232\344\263?\177s\330\031\312H\352?\362\220~\245\271\265\323?\327\'9\332k\000\355?\344\257ILfQ\336?H{e\035&\005\337?JQ\375\026\234\345\353?\267?\316M\007\317\355?\034\310\354\0338-\346?\222x\331}\333\230\352?\230H*\262Xa\271?\337\232\310\273\247\236\343?\316\017Q\3216\320\327?\214\306\362\003B\246\350?j\306\030\264U\024\320?T\036\244\n_N\312?\311#woA\322\344?\312b\322o\026\344\333?\370z\234\357\254$\357?\373\213m\221\021\276\343?\264\327A\361\356\330\352?\300jd\301\322\356\252?\220\016*\020m\026\304?\027D\313\324##\343?\330kn\220\201\325\346?:\245F_\342\002\326?\332\242L\225\371\t\327?dq\335\'\305\320\314?\230\034\304\334\232\351\304?\000\201\001\274\0109\226?\005i\336)#\334\341?\300r\021)b\314\323?\333U%l?Y\344?\275\245\224\360?\367\357?\354+S\220\263\360\350?\024\277\221\273#\364\342?\004=i\315h\200\350?@\305\025\243\210\355\321?\016\375\331%n\270\324?\365m\270,Tl\342?-\004y\265\301\014\354?\256\311*\324\013\264\342?@@L~c\225\223?\346\335\354\372\212s\330?\260{y\010\244\216\341?@\344\t\337hx\274?\211\301\224\r\341\372\352?\370\305hX(\375\271?E;2\244\001\242\344?8@\220\2463\215\304?\013X\364\027n\307\356?\323\247\251\300\261\003\355?p\334\032(s\260\266?\344\244{\322\226\205\353?\371\211\371\360\345\026\343?\270\201\313\240\021\330\270?n\347J\260X\311\330?\374g&\332lT\310?r)k\203\220\340\344?\216\306\340\326\026\272\354?6G\206\3653\007\320?\\0\334\226\264\212\305?
\270`\256\2264W\357?x\013\264\236^\351\260?\270\010.\360E\361\306?\244D\026\273\256\243\344?\n\207\231W\345\214\354?Q\353~\251\203\366\346?&\365\344\242\216\007\350?]\312h\244\227|\344?0\320Z\016\370\207\277?\240\256\312\334\001\365\234?\343\305\334\004\014\341\344?H\367\302\372NK\276?\236\243\200\020(\317\341?,,\207\363\376w\353?\310\235\036\247\302\t\320?\004\342\3308t\254\334?\200\242\223AJ\376\274?\002\376\027r\245n\325?\'p\253J\315\211\350?\370\202\241`Gn\277?\353\004_x\034\037\356?U\305\3300\027)\351?R\323]J8A\335?\236\0109\3611\313\332?\226\237\220\037\376E\324?\336\310\361\255\r\270\324?\010\004IW\345C\354?p\256\177\347\340\007\247?X\271d\306\330\223\355?\234\200\365t\242\227\350?H\330\251\250\240\001\264?\320Q\246\315\323\016\254?b~|\247pP\321?@5(\212^\230\223?\304\312\235K1w\352?\250\240@\201.\034\325?X\230\036\227\230\022\305?\200\275\311:<\203\205?\207\264\343\266\246s\342?&\230F\211\263\355\334?T\367i\023\276\265\303?[\335P\3531:\347?\220\033\275\037.\371\315?\035d\317sx\303\353?\0017[\247\255Y\342?HU/\336z.\346?\344\224\003F/\274\301? %+\276\023\036\251?P\020&\211\341\221\253?\302\036\277]B\220\340?~\234w7\035!\355?\024\326\211\207\007\320\347?\032\376N\333;s\324?\250}*I\351W\307?RA!\232\311\326\355?V\020\005\223\003\270\323?}|\346\263\303\347\324?\362jy\031\374]\352?$\252C\274\217\024\335?l\304\1770\253Q\353?y\336\356\301WJ\353?\342\215\227End\330?\246\037\312\026P\323\320?\360\247\374\025\332\347\273?@n\177\017ze\345?\'\351\"\255\326d\342?/\251z\304\355\375\340?\211\325\240\3159\334\347?v\311\320\330a\201\341?P\247\240\267\352c\303?\250\201\017\263\227N\324?\254v\364#`\375\303?\322m\344\327\230\024\355?v\376BE\306\261\332?T\272\013\211\245[\330?\251{\335\252\r\305\352?\373\253\333d\013\340\354?`\202\313\306\267z\274?&F\304_\334\200\335?\252\014`\210\373t\354?\220\244^\313I\351\244? 
\243\"T\020\355\241?p\272\016\226\027\017\240?1~\377\003-P\351?\362\373\035za\227\331?\035\360}\354 \027\342?N\031\337h\030\030\347?\365\372\213y\205&\357?\272\025\377c\255\224\325?Z\025\216\346\354\r\344?\240\336\026\242\300x\222?\360\306\344A\352\\\272?e\256>}g\370\346?t\316D\nR\005\316?\000\257\276NFy\225?\002\347\033,\001-\322?\260V)\263HW\245?8\273\315nT\222\312?\330{\215O\313\007\351?\240\235\346?\200\324Z\361,\200\223?^\353g\217\261\211\321?\000\204\244\000\271\256\266?\322\261\233\214\262@\336?|\303\272*\275\204\330?6\3631\027\035N\357?\242\325\\\204\366@\356?,\274\356\276\014H\300?\244\214\315\342G\216\356?\021W\245\022\037\317\343?\324\263\326\203\247\343\322?\240Y\021;7)\331?\200!{s!]\340?<\205\367x\307\363\302?\227p\247\336\346b\345?\\\317\255\014\316\r\333?\252>VnH\206\333?@L\301\033ss\315?\312a\341\230S\375\355?\216\030\214\255\035\355\320?\352\266\346\252\354?\251O1\\\3118\353?J\320\032\241@e\330?_\017)\352\375\304\353? \354\202}\312\247\354?\266G\227\200\020\037\334?\230\004\243\253w\362\356?\370\320-aW\"\346?<\210\323\036\215\177\300?\030,\231g\264\213\315?\026\030y\345\246.\340?\010\226>\274\0133\274?g\263BQ\303g\356?O(\207\310y\322\344?\n\250\232jr\354\320?@.\277\317\006\202\242?\205\305J_\232\200\343?k\205,\376Lw\352?\342\204WbW6\325?\000\252u\032\r\356_?8\250\340a\357\311\276?@|\321c@\244\217?\260\256)&\315\237\337?\330\316\253$\371\224\264?\t\204\261B\016\273\341?:0/\002\343\305\346?4\317\221\334\265\002\300?\3701\325\377\022\001\304?\214\313\302\276Z\245\344?\006]\034y\375\224\351?\233@$\237\300\274\341?@4D\251\320\376\347?\316\323,\310\263\356\333?2\333c~\274\264\330?\034\021\350\031\260\316\301?\240s\255$\'\n\252?\355u\030B\035\235\340?\\\233\377\220\353\357\326?\344o\276\321\246\353\315?\230\370\027\246\244\\\354?f^\\q\003\255\334?\231\305P\350P\236\353?\006\331G\225o)\320?\306\214\275\223\014L\345?\363y\256\361h?\347?{S\240\247\305\034\355?Hd]\020\371\023\277?\230d\346g\361Y\310?h|\246\351r\243\277?\230MS\315\243H\305?.\216V\233\215\361\33
1?\323\276\335F\355\232\356?\350\232\241\360=8\336?\262\371\032We\201\355?`\347\030><\320\352?\333\031\302\0348m\355?\036\n.\032\320\237\322?3\353\200U\000\210\351?@,\266\301H\363\252?\210\2757\252/\204\334?w\245\305\330\256/\350?c\350@\312qh\341?\323[\232\306\016\255\344?&x\374\263\361\'\322?\234N6z]]\344?x\217\237dJ\220\323?n\275\002\311\341\345\323?b\373%\321\233s\320?\026\213tM\020H\334? =\302\271$\r\317?\226\030,\372\270.\344?1\247\204\316\340E\347?\364\250\036\035\320\357\315?\334\312\267cf\257\337?\234\202M1\341\364\341?\232\177YqC,\341?\177\311\177rBT\353?d\233\221\343\206=\336?\310\335C\211\035\316\335?\300\275rM\312\360\323?~J\016\204\003\362\344?\000\016\200l\371\371\337?\026\304r\275\344\016\341?\354H\251\315\220\262\335?\222\217\017\342aw\321?d*IH\264;\333?\322\0316\266\035\302\345?&\301w\342\274\352\350?\265\316\341wSq\357?pN\271\377D\227\341?\332c1\313\220\027\335?Z@\265@a\233\344?\033\245\260Y\002G\352?BM\216\262B\000\347?\252\300\307\223\020\311\325?\260\346\230\377\237\247\253?>\322\307[.\367\343?@\335\333\275\312\233\271?\010\224\251\210\222C\335?\270S\314\002\375\315\261?\254Q\235\000aC\303?\244-\233\2344\231\356?P\"\013Ae\r\334?\252\t2\232\2432\344?iH\254\256V\034\344?\014\325\215*\215\003\313?l!\255r\313\323\345?\210\364-x\346\363\346? 
\242\026\316$|\271?\222\204\035Hm\031\330?p1\\\351\036g\342?\356\241\017w\301\222\323?\240\201s(D\354\314?\302\261\341\246\036L\323?\344\343s\311\031\365\356?\330\202|\300\252\374\322?\301\342\213\034\344\303\356?\000\013\201[\336<\353?>@\213\313\026\205\337?=C\330\246\231\201\342?\347\335\3434;\361\347?\351|\215~\365\201\354?\246\002\271\3741\026\354?\300\216\230\035\337,\254?\215\017%\001\245\310\342?\214\330\375\t\005\036\317?\371\302H]\367D\355?`U\321\'y>\300?\034\350o\211\001\245\317?F\213\226\016\334$\327?\301\023Q=^\014\354?\3009\364\267\024+\201?\3249k\360zd\344?\350\030lgl:\324?\036\217\356\203\216\027\351?\021\246\355\302\242\371\344?\003\322\233\357x\343\347?\321\347\236o\017\267\346?\230)R\201\3518\335?T\317O.0A\317?\301\"\277\366\255\311\346?\361\343-\315\334\023\350?\010l\017F\364\263\262?2\357\335\003:s\325?\206JO\336\222\017\346?\200\276up\026\355\205?\211\262%\240\346B\343?\326]\224\t\204\275\346?\220\314F\031z\327\331?\240w\002\302K\271\232?\363\370e\r\324\314\351?`A\025+\245\032\351?\\\245\200e/\336\324?^\304\225\260\315r\342?^\356\275\375i\336\335?\026\333{\220\375\300\331?\206t\232hg\001\324?ey\352\3108\305\347?\352\300\323\212\364\336\341?&\306\311\334\336\002\331?p\250\352\341\244\231\304?\366\351Ud\226\222\331?\304\324B\371\266\363\335?Yw\343\032Pp\355?@\373+\221\017\366\313?z\243\313\276\001\215\353?\004\314\2318\203\350\352?\342n\271j\216\017\322?\343\242.\2759r\341?\204\220\322\364k9\337?\211\255\200Fl\333\356?/\362\221\003\366\314\346?\360T\2075xE\330?Xa\003\235:\034\310?\225\251 
\344\255\300\346?\372\22306\340\365\344?`\277\215\004\362G\333?&\022a\225\233\303\353?b\353\252}5\223\342?w\251\236\225)\204\350?6\210\260\343\357\231\354?\330eaww\r\271?t\235\202\357\304l\341?\224r\002\343c\207\346?\320\371\ru\275)\345?^>\263\213\312w\327?\200\240\371\310@\254\240?\240\376hn\272\334\271?\310\000.n\236\343\331?\374\023\212\243\036J\320?\327\014\360;\253\321\356?\226f\000\035\276\237\344?\256P\377c\343\327\334?VN]b9\021\341?\227\303(\031\342\271\354?\014S\340\322O\337\344?@\355\275\217\233\271\215?\\\000-\243\035I\315?\234\226\007\r\006\206\347?\340\336%\345\240X\344?D\226$\376\303\004\333?\272\036\322\010\272E\352?|\237\2243B1\347?2\3037gJ\360\331?0(\326\311\0028\254?\032\357\327\233L9\356?\260\254\210q\204|\274?Y\355;\347D:\350?\270-\255\001\311\023\353?\200VzWd\215\246?\346\214\244\344\330\220\353?@\351\321\030\007.\315?\304x\255j\006\026\323?\000\304\036\202\205\025\350?\324\357C>\356b\343?)f\"\361\030\241\347?\332\336\242G2\225\352?X\312\344\003!\t\356?:e\025\371T\342\337?\356m\256\030?8\335?\352\232E\276\301\323\340?\274\262\365Jz3\327?h\304S\321yg\342?\317~\307]\353\360\342?\334\250`B\345\300\314?\360u\337c\241m\337?\000=\027\361`i`?\t\\\0318\360\334\352?\323\035\305\024\352\004\352?\370\263\205\344\220=\337?\200n\t{y\240\255?\2778\272GHa\357?\217\\\322\355\2759\340?\335F\234\346\3703\354?\224=\310\342d\315\353?\020\277\325\360D\340\301?\333\313neI\002\340?%>\202\234\202\020\353?\302\356z\355wk\322?X\3522K:\336\355?J\026\323>\272c\330?\360^(4\302x\335?j\217\336\263L\035\347?\274`ZPb\000\351?\222M\320\254bE\346?\314\216\261\204&\253\303??%5\373\000/\353?\202\204\326\333\273\375\343?\355{\324\216t\350\356?Y\nU\252\025\310\350?p2\236\310\3169\333?4e\363b\241#\356?\256nM\304\264\324\333?\027\244\300\277\037_\357?Y@\363\003\003)\355?\007\331\375\306M\370\353?\214X\005|\315W\307?\241\240>*\324\204\350?\027ct\365k\320\344?d\266\200\301\203\321\330?\372\323b\347!k\354?\320\276\244m!\352\307?T{\307W\317\214\340?0\31136\375\351\324?\230D\036\260\213\317
\266?\354|\313_\214\t\317?(\303]\271h<\265?\325\037\022\225G-\344?v\347\225>z\r\336?b1\326\217\272<\355?P{=\257\267m\350?\320\242\216\224\006\314\345?\220\243\206OP\026\242?8d\013\367pW\325?p;\242\"\0020\254?\362\363L\337\247}\337?\340\326\235\321{!\244?<_\212\014\271\305\315?\374\325\332\224\225\326\316?\2109\213L\246Z\307?\340\352p\372\255\300\224\350?\221\303\376\235YW\346?o\335\352^\026\020\350?\202`[S\246\312\356?\020P\035\360\363o\260?\274\3131\372(@\341?\342\303\360\006C\240\352?(\245T\373o\376\266?\264\\\177\321{\314\352?\316l\3115s\355\336?\200c\023[\375f\266?\244M\014\322\211!\330?m\375s\216\010\177\353?U\233\255/,\266\352?\307xu\333\006\017\357?<\266\024?\021\235\305?@}aj\026\230\204?Hl7YE>\354?\016\016+Tn\033\342?\3446\205;>k\345?\010\300v\205\tB\260?\300LQz\2758\350?\353\345dB\216\363\346?B\272\004\320\307\036\334?l\222\356\333\224\200\316?\345\215\331\302}`\347?\352u\010\333\302}\346?\236\362_\360\366\341\334?\336\323\365\267\323\317\354?\232\'q\365\355n\351?\000\271Dk7\300n?\377\331\365\031 &\355?Da\373\357\224q\320?\303\']H\3535\340?\220DI3b\274\313?sNI\037\367H\344?\252\261E\302}G\344?\354\303\371\203\350\367\337?\363\277\027\023Qt\351?\360\371\220\306\351\261\317?%\351\"\310\320_\347?\002\\\3355\010\003\346?\307\341\231!\273\025\343?\235\331\256\335\245\354\355?\014o\371Y\325\357\334?@\340F\237\020\353\215?\030i\345m\222\367\266?\300tdN\337\360\274?\267\263\364u\222\310\343?\257=\004E}\373\357?\257\334N\265\370u\356? \r2\324D0\312?.$C\253\017\346\343?>\246\210\216\325\370\352?)\033|\307\276\222\350?*\220CX7E\322?+\203\213H!\301\345?\035\334\001\177I\237\344?\311JG\204\377\311\352?\232S\271\323\220C\341?\371\250Uu:\207\357?X\376p+\234\367\310?Ha\332e\005\310\276?\354\227\255S$\361\356?N\344\023\347\300\336\325?\214#\364W\3157\323?\343\335\352\207\0362\357? 
\251\304\326.\363\234?\360#VuP\306\323?\304\347\022GS\002\312?^\031\275zA\014\327?\237L\303\024\347\014\353?\301e[\213\262\324\351?,T\354\366\346\276\336?\216\n\222\221\266l\347?\322KH\232\033\300\343?\254!\303%\241\333\353?\340\025:q\r\217\351?H-9\271\331h\270?\2005^9\360\325\243?\356A{\'\0056\335?x\236\031c\0161\260?<<.Kc\266\320?z\354\235Df\304\327?\220\031\331\267\321\360\326?\224(\242y\270\234\353?S\374y\307!n\347?\035\021\272\206\352,\340?\307R\312\223\327\254\340?\300\265\\\225t\277\335?\350}\330\337\347j\355??\204\360\232\263\364\341?\230\270\027\213\365o\334?\210.\362o |\266??\177\305IAw\344?\032\312$B\364\307\352?D\026d\255\234\340\351?\354\371;\033\3363\321?\206W\264g]\331\325?\030\200}D\3112\310?S?\013G\2748\350?Pa\264\364\311\036\320?\303U;\344\244\342\356?\272\206\3767>K\324?\031\r\372\016\215=\355?D5\245q?\034\326?n\035P\r\3517\352?P>\033,\300,\245?8\201\000\304l#\345?|\341?\3305\235\347?\305?\317vc@\353?\300\223\356\333\377\021\224?0\267D\212\211\302\240?|A\033\036\022g\357?.}\373\235d&\325?N\300\375\277\\\245\335?\214T\223\371\031\267\350?\270\347\210`\"\025\323?i}\214\023n\261\356?,(?\2258c\303?\254W\354~\254\266\317?\335\032\021\020\262v\357?j\0172\315m\330\350?\240\333\244\300}\263\260?1Z\213#\225y\341?\027\301\001\356?:\341?B\244\357\333)\356\324?\2026\266\211\024U\343?ZXjy]V\325?@X\365\006\347\345\311?\032\3731\003\023\034\345?\030i\004\034\314O\304?\332\267\037\313\364\236\324?rDH>\345\300\333?F\322\005\273\2604\326?a\307\316\017\017\032\354?\220\337\3375s\\\332?6FV4\213Z\334?\232\020\240\237\002\223\352?\352\234\3506\317f\346?[\r\331?F\017\364\016\223\204\332?\270w\274K\023+\311?v\033\252\315I\367\356?\265\310\004\000-\372\356?R\034i\204\333[\331?p<\212K\262\367\267?p\220\262\372\242\312\307?\334S&n\244\333\321?\3351\0371Ox\354?\207\200\302\377,I\344?\220\205\020\222\026P\323?\010\305M\244\320<\340?\022\274;&o\370\327?\001\211\371\216n5\357?\340\307\243\242\025w\355?\310aBY\242\313\312?\240\224\236o\364\365\313?X\362^\360\307\203\312?\266\257
\267 I\332\353?\200\216\027?r=\341?\262\360m-c\223\350?\200\303%9\3348\307? \025\234\031M\353\342?v\300\253\245\240\366\332?\300\024\2062\036\305\250?\210\033.\r\356\\\316?\256P\034\200I\005\347?N\2425\353\275R\336?\343\315\205\257\023)\357?,9v\311N\020\313?\276\370\376\026\333\377\346?\037e\t\025\351\232\357?\267\310\240\177k\342\347?Fh\207C\033\220\354?\230\272\261\376y\013\306?\373\260\314\037\212\002\347?\352\226\027\244|\210\347?\246\274\0067V\243\320?\230\347\0202\333f\326?\002o\227\261\341\002\345?\320\023\242\212J\342\330?Bg\327\231\327\256\352?\336v\213M\'%\333?\332\302\225\341T\226\340?s\346Y\023#\006\353?\260\320\302=\225\252\241?p\307F[P\356\336?\240\265%\260\231\325\251?]t\234e\343m\353?\300o\377\035\367\364\301?\236k\243f\347d\357?|\347K\0163\304\347?\351\250\301f\3240\355?U\016\213]i\257\345?`_0~Ck\262?\242s\2058\201\314\345?\3542\"d\232\215\342?0\262\'\362#\362\321?\230\251&:G1\323?\216\341\237\242E\031\346?(\330\242\207\341\325\305?\324+w\335\326\\\336?*s\302\323\002\307\353?\343a\356\202Gx\347?\334\037k\324\366\375\304?\207\303E\217|I\341?\022\014\026\033\252\260\332?\360\333ED)\306\270?G\r\020\204\372\200\357?\206\257\207\0211p\345?B\237\032\330Fw\345?\200\250\025\276\361%\334? 
\205`\031|R\320?0\323\252\256\217\025\272?\235EM\232\244\203\356?\000\240\200\221\210\200W?%\313Dpa\215\340?\245V*M\344\367\354?d\352\306\000\336\373\312?P\\\353At;\340?\"}\243\371\232=\346?$=\311\307sq\334?\016cJ\314K\235\357?y\177v\261%\313\343?\373\336\305\372\223|\356?\"3\256\034MD\336?B\351\366\202)\257\357?\020GK\324&\021\276?\250\355<\313\2239\307?(\235\350y\347>\301?\010SGG\223\306\334?XK-\024\234\276\344?v(\203\263\311I\334?\310;\301\314\303\226\322?\210c\344\302\356L\333?\205\n@:\216G\345?\034\274\230\003\025\302\354?\327 q\214@\006\350?\315\311\255\246\3342\344?\030\364\324\363W\231\355?\304\225\352\236v\270\305?pj\006J\360\005\246?V0p\242(\'\324?\032]k\360\211p\323?>\001\373\242\201\357\325?\255R\257\317\377\260\340?\272s\272\326\353H\344?\244\213\365\253q\242\340?\304%\336_\017\303\315?\320R\034\362\376=\250?e\035\001M\244j\343?\250\017\007\017\347\326\310?\362\307\303\005\231m\347?p\016dA\035\367\266?\364\375_FT|\340?Tgq;?\"\334?\270zV1\220\177\356?\214\026\245X\214\216\320?\250\026\227\215 l\317?f\312\006\345O\342\343?\220\306\032\245r\221\351?\240\207\267\004{|\353?EbJ\367\022\232\352?c\223\346\317\037\'\343?\270\356\240\352\221e\315?\372\367\366\356~\227\324?\341\240\253 
\277@\343?\362$B;\006\361\322?H\300\263Y$\200\323?\n\324<\301\006!\325?T\367a\005mD\306?\233\351\031\327\237\276\354?\000\216\000\377\341bs?\nv\250V\246\216\344?$\323\273\377\027m\334?\270\275\317:\315H\333?\004\342\002|?\256\305?\004P%\337\251S\302?y{+\030-\253\351?\256ax\321j\005\342?\340y\367\n\351:\333?\034?\245\006\221!\332?\370\331}q&\007\325?\240vW\316\214g\304?l\032\323\340\315[\304?[+\342\343k\226\356?\200\373\303X\345\010\261?(\361t\312\242S\353?(\236\376\033g\363\336?U\270\324IY\375\343?X#6\260\323\325\270?xy\350\302k9\307?\276\036wi\212\342\347?\214\233O\004\373;\343?f\301\255B\243\311\321?\034\3333\240\027\364\326?\212d\203\367v\006\345?\352\2416c\346\234\340?T\036\016\207\325&\353?(\216\023\016L\031\320?\362\330\001\016\213\236\326?\250\255\350\036c\304\305?\007\225\271\313\373 \355?u\203\326@E\035\343?|\027\343\324t\n\322?hB\265\205\253\226\302?\214\364\032xN\257\340?f\310\320;\255i\330?\3305c4>\026\346?H \330\242\223)\262?\330 m\351%\000\264?fww\352\225p\344?\347\334\336\376\314\221\347?\030\371\265\001\0349\306?G\357\260\351\374\003\353?\312cI\326\241\311\341?PU\321\264\235w\271?\340\215\002\030\301\336\274?\306:N\307G\233\326?\366\330\276/R\324\333?\226\2377YZ\260\347?P\"rf\t\277\267?8\021\364\004\350\341\340?0_T\322hH\251?\\\345\207J\006\271\324?j,PH\024r\342?.\214\371\033\300n\356?J\335\030\351\2741\351?\344\212\314\376j\377\355?\'\003\013\330\035z\343?\324;\356\202\366\240\341?\006P\314\t\210\344\350?\202\340\220m\260\241\346?%\241%{&\243\340?0\206\305- +\317? 
\260 ~\313Y\253?\236l\223\272\276\017\331?\237\007\262\312\235K\344?\254\260N\233\213\030\300?\343Z;\023\353\201\355?\324#\275\001\377\361\321?^S\034\020\032\272\343?\260\001\204\301\004U\242?\342{\236\035o6\320?\000j%\375\257\310`?\264\032\003\333\017x\344?8Q>\352\366m\327?0KD\273\023 \252?\000\014DE\265\251\274?\030\013\320\210?m\307?\330j\300\307O\243\342?h\306B\321\363d\301?\334\223,&\022\277\357?\260)).wV\352?%m\372\001-\235\347?\353\006\303\323\321C\345?\250\352\373|#A\355?\360r\345\307\356\247\262?\250J\271\350\246,\353?x4u\246\0162\357?\255\022\364b\311\n\355?J\233\371\217\316o\345?\235fZ\003\270\323\353?\365GVR\335\223\354?w0\320\204+T\356?xE~\330\312 \305?[\345#<\2407\353?\270\204\002\225/\230\345?\036\215\317r\320\013\342?U\262\367B\200\210\342?EUF0\031\322\353?\335\225\350`\252\270\356?\256\267>\360\317x\321?@\236\251\273_W\326?\222\210p\262i4\325?\013\353\265\226@\003\340?\340\262\260\350\363\222\272?\250\272XO\0322\343?\035\240\320W\021\236\353?p\036\335\2329\320\356?\200\374\033<^\004\223?\354\360\256*\204\213\330?\"\223\250\271W\312\330?\374\350Cu\377\356\327?\212[hv\313\375\322?\3450\213\366\321\312\357?\376\320\315<\254.\340?\320\332M\030\2357\322?\216Q\304h\234\360\346?\240}p\3433\316\314?%J\262\202\251\363\344?^\232\251\326\372m\322?Z\341\345^w\221\333?P\235e\007\nn\242?\344#\206\333\243\227\345?\213\317\177\366\302\356\357?4\236\305\022\207\276\311?\320\352Vi\346\215\242?\270?\236\002b/\313?\031\272\202\231\366\333\355?\223\346{m\342\261\354?\245\021\322nn\225\345?\260\3557\006\0008\356?\332\362\363\")\320\344?\236O\037\361:Q\350?\260\257\321\276!\260\262?@\020B\216\351c\302?P\233c^k\212\251?h\270t\254\324V\317?\272\\\342\257|\375\321?\351\'W,\323\204\340?B\253\005\305\236\023\340?PC|\264!\242\252?\271\262\336\247]~\351?\036b\350\262\244\353\320?\332\254\244P\315w\345?\260a\032\n.\234\256?H\236tv\340+\305?Fx\t\213\031\233\352?\2637Y\241\303\214\352?fhT!.\360\325?\177C3\345\003\277\350?\253z8\024\226\301\355?\325$\234\002[\274\342?\211\274\005\00
5\014\354\351?\033/\210\014\300\355\340?i\004\024\206\265\314\355?V\351\335\335Y\000\325?\363=\212\030/\246\350?$\323;\230q]\343?\203\345i\345\220k\346?\332\256\220eF\365\344?\232\312\361\215\372\205\321?b\374\301\235\242\256\333?~\010J\206\006\361\336?\272\354\353\260\311:\340?r<\242B\261\260\356?L\2257\250\223\005\305?\254\322\261\200\244\233\323?\014\307\321\365\014\370\306?\346\316\025\337-\246\353?9E*\"{\304\345?\222,\033I\006\254\320?\r)\225\327FS\350?\035\006M\332:\032\355?\036\223)IV\342\346?\r\232|XA`\350?7\177\212E\307\177\347?\242M\020!\'\220\345?\242\004wu\303\027\345?\037\3232Q,X\355?4\020[g\315[\354?%\251\276\353m\241\343?\242t\227\030\357^\321?\234Tp\374\263\233\351?\010z\216S\320\203\345?\310F\367tG\370\265?\265\020\376\005\n\244\347?\226\017\022\275\032[\330?H\373m\010\332\010\310?`W\263\353\014\001\336?\021\177a\271\024\226\352?\300A\177*\\u\345?RX\240\275Fg\326?\217B\330\003\240\257\357?\202$\261e9\213\320?*\362\034k\374\256\330?\300m\330\004\034\227\225?\020\204A\032\023P\316?\016\331\304\016\3161\335?\034D\020T\335\240\337?\356\312\347E\237\\\343?t\327#K\017l\345?\t\224\035\215}\231\345?@\274\217\377>\377\303?0Sa\014,W\253?\270\302\301N\305\205\354?\210\205f\243\014\343\304?`T>\227\206\247\252?\0207?\333*\231\325?\014\344\302\265\302\304\352?\350:\214\230\330\211\322?\212$\255>W\017\353?>\025z\240\t1\346?\036\234\023\314\273;\331?i\271@\203\234\275\342?\214\364\357Y`M\330?f=,\224a,\346?\020\255\272\020y#\267?\220R\316\340,\326\273?\226\307\"\273p[\351?Z\250\312\'M\004\327?p\311\014\241n\241\250?\'\211\337e\005\377\346?\305Fe\244eq\344?* \256$\333<\347?W\2156=\3050\352?\246F\262\234\003\256\324?h\362\n\351\030\031\341?`\346jwOn\313?\270\257\274\366\220\310\263?\214\361\037\343H*\300?v\'\234\300jX\332?\000\304u!\332\033[?\220\'\353\024\035\377\324?\206\377\010\273>f\322?P\333\322F-\310\276?h2\254b=8\274?\365\345;\365\233\037\341?\376\306\361\tgS\327?\016\224-\263nz\333?\304\035\274\013k\274\337?\334\246:\213\360 
\305?\300\345\360\256Q\246\325?\261\322\034\377}\324\345?\350\362J\370W\022\340?\240v:\030AB\314?\261\3553\3017)\344?P\274\226\302\260\334\321?Pkd\014o\"\262?\240;\014sq\022\261?\216+\231\360\214[\321? \276\2562\300O\246?(\014\201\252\367\354\336?\204\261\276\201 \'\352?\311\270\312&N[\351?k,\207}\033F\345?\203\3645\2576\362\345?\341ET\276\220\036\343?\224r\346\322\241\\\337?h8\320R/\204\344?\360a\373H@\205\301?\304-\r\360\320\205\327?9\327<.\205\250\346?\240j~\270\201\236\342?\360\n\257\200\257\027\352?a\305\234y\361\277\355?p\221\324\353\274\304\240?\225\264W_+\003\347?\200C\333\254\0340\272?\323Q\307\342\352(\343?\216\000\230<)\342\323?\236>\335\213\007\204\346?\340\006N\362\231\320\260?(.)\013\364\215\333?\200;U\251f\272\273?`\201\202\353\333t\310?\020(\374\361\350\266\326?\\=\204\3335\234\344?\212\206j\320\030%\351?\024\351k\300\322H\356?\000\264N\271 <\326?v\315\247\354q\352\350? g\372\353Lx\222?\366\211\347IE\031\334?\300\246N\3745\255\307?\272\321%)\354\201\355?<\263\221#\256L\322?\304i*\000B\037\310?\262\037\253\217\027\307\357?ty\240x}\002\351?\210\334:\335\364\034\326?\034\216B\321\346\371\323?\250\347\375\"\032\254\335?\375?\205\027\373\300\342?\220g$\"\243\212\276?\245\204\006m\"\244\340?h&\r{\222\221\353?\253\273\356P\331\364\355?\010\004\001\363\263\026\332?\371\026\026jn\367\345?\000\315@>\365H\271?H\302\323\373/\313\357?\200\352\206|8\330\341?\216+0\253z\244\345?\000$\021`\2633\355?--\204\\7\225\350?\000\346@\'\020.\347?\0301p\004\227\244\327?\032\365\315R\265\316\326?\317\363\265\261\350\263\357?\250dk>\216\255\356?]\373\n\r m\356?\030\261 
\230H&\262?\245\317\\x\257\205\340?@0\2447\320q\330?\025\220\237\373\210c\344?H\364\3228\260#\271?\326w\213\305\271\305\342?0\306\r\323\222\223\253?\370\004\n\325\377B\307?\251%\007\247\260\275\357?\004\r\245\273\236y\331?\366\222\021\031\375;\334?|\314\345\313\335\201\323?h\320zg\272\273\313?X\030\276G\224\262\352?\237\177E\356\230\230\353?H,\272\266\211\357\323?\300\006\214\247?\017\342?\354R`\374\315\007\300?(3\010]\277\003\311?,\204\321\320\004X\300?j\376p\367ip\352?8xD/\274\027\327? \036\317\007\374\270\234?^1\262\212\244\207\333?@\256E\035\322t\237?\314G\360z\230\257\304?\256\222$\273\354/\354?f\365^\001\310\033\331?\370K\265$\t\006\341?r\226h\t\314\315\331?\220\245\253\357\244\257\315?`TQ\374\303\'\227?\330/6\244\224S\340?<\357E\252\0338\335?\274\024\353\221f6\336?\334\345\241-[N\314?OEe4\353\002\342?\206\"\304q/\267\356?<|\260\210\213\017\356?P\\\316\001\311\265\320?\340\t\360\313\235\304\302?\277cYN{l\350?Z\302\020\253[\342\357?\314\266w\265\262y\313?\337}2\346@\272\355?l\337\261HLL\312?\3400\312Z\307\367\223?\244\241\373\312\343m\352?\306\216\213\223\031\302\334?\324\037j\036\020\250\331?\256\014\211Yx~\334?L\0273VD[\315?\000\324+\004\267,\333?^\336\215I\177x\341?\260[\361\241\212\\\263? 
\270\222\245\341\320\356?cB\242\212e\334\343?\014B\3709\367\250\316?\253W \226\036U\343?\315H4\232\rB\344?\024b\307N>\032\356?\211\2230eSu\340?\233#\031/\016K\350?\000\222\311\241N\n\264?\207\350\031\342\r\246\340?\361l{j&3\341?\000\243\317.]\262\335?\266\362By/-\327?\010\214\222&;\355\310?\364\226\352\366&k\321?;\314`+(m\357?m\t\356~\332j\344?\010\223mP\0273\331?@g\253\347)\321\305?h\210=0\247V\340?w>\336~\233o\340?\374j\014\017y\371\356?\035jD\302\2743\353?\034\026\267BP\300\331?\360f3\253{\233\302?\376\347\314RN\302\320?\024\3349ju\276\341?\201\202\013\336\262\200\357?\310\3347-B\362\331?q\263\215\301\372\250\343?\303\256\323}[s\346?f\231 \321n\350\345?h\207G\202\351\023\322?:c\266\331\255k\346?\273\346\351|A\220\344?%51\202\'\221\340?\267\210\331\243\324\302\343?\\<\372.\365\346\340?\232\353K\311^o\345?\313\324\007\375\373\340\353?\303-\353\321\317*\347?\306\370\026\033%\331\336?t\221S\215iA\316?\250&$\236\006\215\317?;\251B\213Y6\357?4\016\025@\357\344\331?\177nY*\243\211\346?\333\274?\332yd\346?\275\227HYt+\342?\214\237\224\264 $\323?X]\2663\001\320\302?\020\324{8!\215\271?6\017?\207\t\241\343?\326\020\234\242\232\271\352?\340F\337\304cY\257?\2008\220a\302\202\211?xE\376cr\346\341?\322S\261E[\273\344?\204\364\326\371\350\334\346?\025\3054\376_\002\357?vk\001\036\230\225\320?~p\377\207Nc\342?\360h\200\320\2450\330?\341L\227\317|\310\351?9\365Z}M\366\351?\320P\rf\237 
\325?\266\205q\205\341\333\322?Ur\017\223\265C\355?N\177f\027K\022\340?\\O\336\266\262Y\323?\363\325H|H4\355?\237\275\013\222\304G\341?:\021D\336\3207\341?P(lyM5\262?\\\211f\241\333\330\356?\370-_\\U\266\327?\"\237K-Ax\325?$%\304\013\205>\333?)c)q\350\273\352?>7\331\313Y5\351?`\222\245o\311\322\322?,\034\306F\026\243\322?\307\n\r0\366B\340?\214\223o\275`\301\325?\250\022\000\'\n\214\310?\020\360-Pd\266\251?\00445\205<4\316?T\025\004\317G\010\353?>\352\252?\300\313\336?\262\251\222)\210G\320?XJ\360\376\253\005\327?\345\375\366\272\217\312\343?\317\24292v\340\352?z^\225F\034?\341?\030\261\233C\307:\357?\330%\222\341\340E\344?~\245\027\033`l\325?r2\206\200\221\374\345?l\033F\274\364K\336?\354\013\210\346\334\273\335?r\"\362\302\037\200\350?\030\010\000\233\354\024\302?\262\031v\016\317\010\332?P\345K_\340\276\302?@\375\032\261\250\252\273?\316\241\215\327\247\362\350?\330V\353P7]\272?\345\346\266\352\307y\355?\034\222\003\315\005\030\350?$d\231\014!X\346?$+\327m\244\342\342?R\274\337Q\035\201\346?tY}\334T\335\314?\340\217\367\223\254\373\312?pF9\036\214\271\245?P\321n\305\314u\265?\303\234\026!\275\007\341?\024\201\367\265\373\244\346?D\t\002W\020\213\324?\031\375L\337Q\361\346?\225b\267\312\362\317\342?\362\177(\316\337\357\322?:F\222\234\227\251\320?\027\205\004 \276\366\351?\340x\361r7\'\242?\376\310\002}\360G\350?\352\007\243\341\027\367\345?\374\351\360\311\320G\307?\353\305B\257\324\023\344?\242?e!l\252\320?\223H5\272\205\025\353?\342\365\025S\325\373\342?\340\245!=ag\274?\000\246-\002\246\223\\?\364\253\3625\356\017\335?8WDO\357\204\327?\034\3311\225G\006\327?\210:a\312\345\351\261? \377\320\262\360\024\230? 
m\270\177aJ\313?\224\330\361\340Ll\302?\2522\251{\327\256\345?,\373\210\317\261^\345?\333\356\313[\0058\356?\244\361fK$\240\303?\362\365t\240b\361\333?\347\\\320\211\216>\350?B\276\033\010v$\325?\001\306\257\254\022\007\356?hO\014\010\036\234\306?\362p\364\370\330\331\337?\022\261s\177&\217\336?\014\376\300\206\226\355\352?\200\264\220\275\366\300y?\2113\235\017\267\361\354?\331\314\036\014\336r\342?\360\353x:\261\327\276?0\206\025\322!\360\340?\335)\356\207\"\334\353?\\\235p\027\340>\350?\020\231\2550Zd\350?\252\264|z\002\214\341?\276\245\325\276\372\017\325?LJc3y+\302?\322\227/\344\253\255\347?\030\300@h\243\253\337?\222\007(\324c\254\332?\300\306l\342\361\371\303?\265\251\361\217\260}\344?\350\357$\t\231\254\317?\372\317\237\006w\374\327?&R\277\244~\327\320?\227\014\206\342pF\347?\3303\201q\334a\344?\213\331\250B\242O\356?\216\017\3744\033\306\327?\006\272\255\373\255\024\340?\300\221\357\202\017\306\204?$\321\344\373\"\267\312?\230\251\3478\332u\347?p\274(\327\353 \321?\314\373\305\177\024\027\327?\032\271b\224\254;\345?p?\314i\220\234\343?\316E\227\313\231?\341?\344\024\222\246\242\341\344?\000I\246s\2747\216?\242\250\371\213\361\376\345?\211&o\316\267=\346?\014\343vJ?:\326? 
\213\214kN\000\344?\212\031\177cl)\320?b\322\206R\277\037\327?Bv\007\242\215\t\343?a\022q\225\234^\342?\324\234\346\323\026R\333?\324\276:\372\307F\351?r-\230G^8\344?MMa\362#>\340?\322=>\333\375\304\321?\210\206\304&\372\320\330?tI\252\232\032R\337?\363\016K?\216K\356?\003\306J\026\264\302\342?y\254\005\213\301\260\345?\216P\227\211\266\302\352?\255+\025\324\210\034\353?\200Q\236\277\214\361\322?\206\274\343>l\214\330?\002VDY\333N\341?0*\204\031S\232\351?2rr\035q\227\347?+\276\031\317;\215\356?\222O\265\320\014\345\320?\227#\267n\024k\356?5\331l\177\0318\352?\266`\367D\3361\345?\360\216\265\351\206\374\326?P\370kL\344\251\272?\200\245\302\261\324xs?^\335\202\271m\007\324?\206r;\300\036\321\322?h\025Ma\323$\357?\221\376l\270@}\343?\366\222E\024\213[\356?\200M\2155\202\246\234?~\324U\354Zr\333?\264\204\n\336\037\033\355?\235W0+\304\245\357?\353\365\223\202\033\362\355?|S\327a;\271\323?^\024+>\006\257\357?\214\326\321h2B\314?\300\021\253\375\\^\301?\260\002y\r\325\225\240?,~\237\320\007\214\352?\013\326\"\267\0217\351?l\363\000\356\341\264\345?R\300q\223mC\331?1\204\201\2505\032\343?\002n\234|\300\021\331?l\331}\330\253<\347?O)\322=\342\033\342?\315A\324\315B(\350?\206J\361*\257L\342?\024\t\214D\235c\336?$\253\355\223{\'\324?~\n\033\375\246|\345?\003\320?\006\177@\354?Xe\353\215\236\217\354?,\367\301\221\277\312\321?\365\266\310W&\007\346?~\221\000\326\276\320\334?\310\357\003\306x\330\314?\351\332\006\334\030\376\354?\"Q\341Z|`\326?Z\310`\303u=\333?v\314K\027\350L\351?\262\340v\274\201\354\323?O\373C#\232\342\353?\236\213\221\365\234)\346?\320\240A\214s&\354?\312\200M\361\336D\351?@i2^\336\024\344?\2445[1\023n\324?\241\324\206D\2209\351?(h\nm\331\337\341?J\270y\336\031\223\324?\376\257\3714B\326\322?\027\331\236\001\242>\345?\261)\256\270d\266\342?@\212|/\344f\204?\250Z\233\316\276p\307?P\236E\251\026\026\346?@v\266\271K\360\272?\\\n\334\026\311i\353?\304\022\225w\365\002\314?\\e\234.\217\370\302?\264UNQ\005\\\325?\270\014sM\244\370\322?nh\262(\315\210\320?5\233\r\354
\036c\355?\276\330$^\364\016\331?\266A\262\254AR\335?\2361\023\312\361\235\353?UB\013\302\351\375\344? \027\260\316\236\314\223?\354ea\007HP\311?@\"\264\004#L\253?\242\373\360X\t>\355?\240\013q\032\'L\354?\245gJ\357\345\213\355?V\021\300\360\346\321\326?\265\254\301\336\323d\350?\025\266f\033\310 \342?c/\035\036C\376\342?\340\200is\274\273\270?[g4\227 (\354?\216n\277\205\035i\343?P\024\264\224\202j\314?\305i\305NM\001\351?f\211\200ol\"\351?JC\334\254\352\247\351?:[\034\374\325\237\345?\030T\200N\013\361\267?8\032$t\tj\316?\236+\"E\2734\351?@-\220\316\350\334\267?\005\201\267\222\345Z\356?\227,\202\223\r\353\356?\3165\200\357:\034\336?@M\253\363<\253\236?$\033a\270|\200\346?\31468\021\365G\301?D\244\211\234\323+\356?\177\036\033\245\232\200\353?Vr:\337\033\021\333?\332\033\234\014\254\027\350?\376\356J\320\004B\324?\334.L\022\263T\325?\270\373\223-1\273\275?(9\263\223\0333\305?\370i\312\276\346\216\321?\352\311X\260\246{\334?lf a\326i\304?\353\246\007\033\"\221\343?0\341\304\001\"E\327?BEx\350n\263\325?Z\036g\365k\355\327?\366\244n\240o\257\334?\234\021wP\345U\312?\350?LV\215\246\271?\333H\030\346\'&\356?\364\365J\302\304\034\316?\224\377~S\242\332\306?\235[\304\231d\036\355?\037S\001\255=\235\343?\270R8\034\275\367\275?\360\301Q\376:[\330?\274\246\237u\370\271\300?%]\036\334\261\263\345?h\363\341U#\213\300?\010\243\255\266\223\344\355?\214\260\'\023\r\004\301?\030Z\237(\322\255\351?\270\216\304o\017\025\263?\t\235\006y\022\203\350?jM\2368&\361\326?\214\223\035\024]9\345?\341d\330\265\276\024\353?\027$\210\314\233\244\355?\006j\3240\326\277\333?\004Y\337\212\251\362\325?\356\272Y\315\363\263\341?\243*\034\223\250\313\356?d\322\237\to\004\325?\222/\201H\362\376\344?\010\226\340>\'\336\331?\250\326\026\000#h\301?\370\204k\322&\314\343?|waB\205\233\322?F\006F7\275\227\350?\236\323\324\005\265\316\357?K\231\001<\340X\350?(\306\204F\246\005\353?\2445s\324^^\306?\200\377\204\343\346w\310?T-\362T\013,\314?\270\3600\205\327d\301?\264q\300\304\032;\345?0mW\263\277\220\260?\00
4\366\017\014\343\217\322?y\214\2033\355\r\353?T\024\035\327b\021\340?\354\361\354\031\220\237\333? \334\036\343\343\272\355?4g\"D\3047\333?\314U\025\006\016\363\345?o\246\020\255\265!\342?\336\361\314/\300R\332?\023\216\362\002O\211\356?6\242\266K\266<\351?\230_Z\315\323\361\316?U\205&8\267A\351?\346\301\027g#h\326?\2428%\204\331\351\356?\232\227\371\301>\354\337?\240&]\021\276w\235?\354\320w\274\213\237\353?\230\t\020d^\003\352?lE\n#()\337?L\027\r_\356\300\354?\214.\2348g\001\306?\202e\034\013\271\327\325?\033\355\227\014|\373\342??\243\356\252\347\364\356?\226\3073\300\217.\353?|E\330\350\005\205\352?@\032\266\325\036\270\357?\373\332\372Co(\342?\n\025\340\202\315\017\340?>\315\320\343\002\262\342?\200+@q\ty\312?&<\357j\320~\351?\304r&k\025\360\336?`\314\333\207\333\270\331?L-\247 \263\361\355?\262\240\013Ap\211\337?\371@\210&\253\244\354?p\210\346Q\255!\312?b\275\255\332#B\342?\254\2378\252\330B\200\330?,\245\215<\n5\302?\000S\232\240\206\217\234?\3202\220:w\363\354?\276N\236\014\206\023\320?\302\235f\303\304\250\324?\340\273\272\314\3169\320?\020T\037\337\376\375\310?\210\345\233\305I\260\352?\005HFC\264\213\352?\262r\345\234\277;\321?-k&A\027\332\346?\244 L4\365\250\342?u\223\3160\356\004\355?\243\033\272vBr\341?\230F\0068\203\027\326?\200#[\330\233\307\333?\260\233\241\355\374\344\256?\000\224\t\247\271\345F?\000\026\231\200d\231\237?\340\254\236\246\262\023\262?@\213\216\255Q\024\346?\354\004\347/t6\316?\206\263\310%\305\323\356?|(\237\200K\004\342?J\325%\2745Z\323?\352\272m\277\203\t\330?\n\315\332\3045\271\342?\213G[,X\327\342?%t\354B\005\257\356?\244\243+\340S\236\350?\2534)\233\260\035\351?\365G\"\030\202D\342?\200>\310\014g-\230?\340\206\334\353\251\261\231?H\t\000\262\307\271\260?P\370\000@\033\277\301?A\214J0B\271\347?\024Wa\245\027F\346?\200k\350\212\240?\204?\324\312\343,A~\301?\016 
q\320\2708\355?\224\2510N\200\254\326?\014\302\306\376E\004\311?W\2642o3\216\344?\200\207|\3054\302\226?\262\221\031\314\002-\354?\030\232\333\237\026\312\351?R\307m\002\2259\320?\200\251\374s=\203\323?\242\347]\036\354E\335?6t\256\235P\256\332?\000|\203A\304\340j?\246\330-\024\364\376\340?\342r\231\307\346\326\350?\262\333\223\351\374\361\353?\212\374\342LF^\337?_\325\267jH\275\351?\274\225f8\211\230\313?\321\330\013~\177[\343?\310\2020\211\033\244\261?K\263\365:\214\307\340?\340\'VOT\\\334?F\312!\200\351 \323?\034\352}\020\026\360\321?\313\300\177\375\336>\342?z\305\210\3712\242\334?P\361\"\334\304\036\267?\3244\372\321.\374\313?\214\230\376\017{\022\300?\3269\2071\334\336\342?ed\215\277\362\336\344?`\310\206F\240\342\235?EQI\'[\177\347?@\021\246\362\235\350\277?\000\021\225\3565\016h?h\354N\017\311q\312?\016\000N\032\241\034\350?4\020*|\322\264\320?\302\267:R\244N\326?\236J\023HK\377\341?\210}\256\257\227Z\305?\360Q\367\177&\235\307?\200I\177h$\320\214?e\014\267\303\220\035\350?|&\377\332\253\337\344?^\314\017\331e\265\354?\320G\201\327\n\363\245?l\200\020\027K\346\307?st(\0060J\354?\200Q\243\332\231o\355?\312\213\263\317\223l\335?R\3378L\347\263\355?\036i\305\334\310~\355?\222\274\273Wy\301\346?\210\217\303b\275\324\304?\365\347\020\371=\306\355?\0201\262\340\356\353\246?\201\3300\350\001\313\340?\316\350dap\024\340?\273\270\037X\263}\342?\030R\"\354u\n\336?\262\365\331\223I\215\341?\234@\226 
bf\334?\342\362\031\367\"\232\352?P[\236D&Z\354?\347Cy\340\0347\344?\250\3774/m\335\263?\234\322\232\224\237\321\312?Z\\\034J\372\240\322?\034\306S\217y\346\347?-\251\260\364*\353\352?\273j\304^\214\351\353?\211Lng\277%\344?dH\355}\371&\320?\300\330\321z\363\236\316?\252mK\2049\365\351?`\226\277U\016M\273?5\375WX\301\357\340?\360\030\337*\214\003\255?\3127\202\356\357\212\334?A\361@\360\032\262\344?\301\312\037x\016\323\356?\200\324\317+\342\271\265?-\221\203\306\030\266\353?\324\035]N7&\312?\206Y\316\273\250\247\323?\275n\254\236X\213\350?8\263\326%z\356\266?\353\273\276\374X\026\344?{\000\257-\366\276\352?\337-\257\240\026.\352?\030\177z\273jT\266?\270\0133\256}v\306?:\010\225?\t\206\347?\242\206\260\272\220&\333?\020\234\250\256\016\227\311?\235.\333\003\254\263\342?\330]\\\177\263K\307?\230\003J\247\340\343\355?\007O\033\226\261P\356?\005\314\346=!*\343?,St\002\344\374\301?<\321@?M\347\356?\010\364]O\311C\340?\210Q\353\344\361\300\301?!\017\'\276\303\345\352?\242\016\217=6\272\327?\022\0373A\022\235\345?6O\253\017bj\321?V]\2574\226A\322?\257\027\211o}\336\340?\376\014\n{i$\330?\220r\001\253LJ\307?`\203\247r\237W\267?\216\005\253\366u\327\353?\320\023\204\300O\305\267?\324\347{?$M\337?V;E\256\025N\350?\210\014\300\330\361{\307?\355J\336fmm\344?\201\332V\034c\352\356?\306)\3256\265K\342?\010M\377b\340\372\300?\220\264vH\300\003\341?\\\367Y+A\n\315?J{\232u^\034\322?\375\247g\337\031M\347?x+\337\274;\240\302?\202\202l\3517\225\337?\307i3\361\320\314\355?0\265\2227\264\275\306?\320k\323xNg\340?\241\377 \334\276\335\352?\264\017\274<\266\000\301?`\314\321\001\177|\240?\250\227\225\207\024\r\264?\204 
2\272\200V\326?h\364.{\272\027\263?x%K0s\303\266?8$\024\357\264\307\320?\376~\247%\351Q\330?\352:F\263k\262\323?`ok\262\227\316\264?\2579\305\2036\217\343?\034\226\205\303\205c\314?Z+\351U5\016\342?\032\013mUR\302\321?#\226\200\020\365\335\355?Yq\026\331\310V\342?\254\215\026d\'\330\355?\340\300\234\210\250\312\235?\213\202\315\010.\276\353?\212\002\351\363\025\327\324?2.\212\026\375v\335?\254\254\270\364\241:\357?\000\317y\020\033\010\317?E\301vz\360\223\351?^2J\240\203\325\346?g\007\266\207\367]\341?\214\263G\252E?\304?\2022i\274%Z\326?`O\031zy\022\244?\202\331\216\356m^\335?T\337\261\017\305~\324?\030AO\377\255\244\334?\300\303\237s \350\276?\316k\250\275<\224\321?\232\033|c\312\321\346?\265\351\231]\014H\340?\010\250\320\257\361\366\334?\264\0226\032\216k\316?\3109\335\2642W\303?\357p\373[u*\344?\350P\341~<\277\313?\000\005\317\333*\341\225?\332\224&_\322\321\343?\232\376t\345\374\234\326?\232\006\352\21093\335?\353\177\215\206k\342\346?\3600\005\221\302\302\304?B\344\240u\257\367\355?\314*\275()\233\324?\020\024\2560_k\347?Y;\370\263\362\355\351?\034]\271\252Ek\315?\320\322$2\250;\322?\3201\2477\211;\242?\365\016\363\245\362e\344?\311\352`r/I\345?\005WB\340\004Y\356?\370\000\223\354\322\364\305?\322\001\210\nD`\321?lo\222\\=\266\322?\374\372\216\024\343\305\322?(~\247\214\223U\326?\rA\321\222\315;\352?\325Z\235\305\223*\345?t\260G\316\372%\305?H[=\2449b\344? 
\341\276w4E\240?\226\251oP\376\242\347?\234;\035\333\277\314\341?\215\034b\255\301G\341?:\325\371\340GQ\344?\340\274\035\2558\275\226?\005\275\242\376!\243\354?v%\345\315N)\322?f\232\264niY\330?x\207\222\224\313\231\334?\362j\261\257\025\243\342?e^\214\332\217\266\353?\300\210\006O\027<\343?\202#\203vV\314\336?\260\262\256\274h\205\262?D6\305\032\345\346\344?\000\006yS\302\200l?\252Q\231K\260\211\341?\036\2679\257\336s\334?\260y\242\314\003X\342?\344_\304\005\201Y\303?\202\206\230y\212X\354?G\324*6\310\214\347?Q\276\343-Z\033\352?\300\273\001\022\205\224\210?\030\210 Ud\000\277?\310\223Hl\035x\326?h-\264\224eU\346?p\301\363\273\266\203\333?4\r\037\277s\354\353?\200\223.\003\031\341\351?\025\\\221\n08\350?q\242\3409\343;\342?\271!\366\367\235$\347?\315\227\027I\347\375\341?\316N\030\245`\016\342?\004\000\334\331\376\234\355?\363\"\273\212\232\000\353?\320u\003T\302\356\336?\243\213tM\034n\357?\2405\304p\215\260\260?\201\036\334.\346(\350?\235\223\302\333\346\\\351?x\336\226\256z\234\320?,\240\024\006\006O\314?\367!\254KL\210\350?\010\351\323\227x\026\302?\303\304\361<\240H\352?6\\\323GH2\337?h\227&u:r\272?%\301\346}\230#\354?P3\243\235\251F\321?\355\"\355/\240\342\342?\217\255\373\233\245\366\341?\240X\241\357\032\344\224?\005\373\230yi\257\356?`,\177\210P\320\325?\340\337~\014\252l\347?j\201\307\255\262}\337?O\222i%\267`\352?\200\200\234\361S\305\224?\315\201\304n\362\331\355?D\240\251\023\214`\321?\363\034\335\360\310\025\346?\026\217\346C\361\010\356?D7\263\371!k\302?L\177\267\371\2755\325?\346\014\303C&\242\353?-\273A\343)\306\345?\364\020\221\002\216\002\321?\346\323\223\224\364X\347?\327,\025j\224\202\344?EG\214-\376\334\346?\ti\350\252\273\001\353?4\032J\217S\201\357?\342\244\363To\"\347?;\021\263\312\312\315\356?\244\004\327H\257\031\345?\340\214\002\246\352\324\317?-\215\037^8\000\356?\327&\216\020?\226\352?\351\002.\031Zx\341?\377\345\3641\020\270\354?\217\3230\001+K\346?`\3467\266e\211\222?\240y\017\022\315\370\327?\010J\022\001o\374\353?P\034\264\307G7\32
7?\037\334\340w\3029\347?\320\334\223\211\363\031\355?\310\177\007\030\007\351\301?\350Yt)*\027\266?5\242\310\370\250\254\341?\272\212j5f\023\327?\300\227\305\321\231\314\240?\215\325\361{\004\t\355?\264\200I\230\n\266\351?\340f\207\200\027\\\353?`\236\276\215\022\320\325?\254v\033H\212\320\321? H\316&1\\\267?9\314\220\201C\246\357?\3407vX]\222\232?\304G\020\241e#\344?\302\227&\237\261f\353?PU0\375\007\343\246?la\237YU\020\356?3\201\337N\200\320\356?\206\224\306gy\334\335?\020R\334\313\233\006\321?\327\006x8X*\347?\375\372\315\205\235N\340?\330\260\006\357\356f\353?sT\265\367\332\010\340?\202\366s\"Z=\326?\322AyH4M\337?\313J\374\271\256\316\346?\374\200az\303\207\322?\231\355n\365\000\303\352?x\\\226\003C\200\310?\353f\005\254u2\341?\006\1772;V\262\341?\214X\221\370\r\010\310?o\254\002\227@\326\343?\257\344`f\021\303\340?b]0\030\331,\352?J\236I\303\376\306\347?\332\006\241\311.\024\330?2\'\317B\221\315\350?\3347X\253\371\300\321?\350\237C\n\305\260\260?8\346,p\230Q\266?\222\007\200)1\276\320?h\356\222ol-\334?`\224u\313M\265\237?X\337P2\227\263\346?\010G\344\222\2037\261?\220\265\357\333X\241\355?\260\266W\343]c\276?\232z\333\236Qi\341?\354\211\005\274\020\321\315?t\211]\006\366e\305?\032\335s\376\322\275\350?\n\336e7X\364\330?\006\245e[i+\323?\364*\312Kuz\350?g\226\035h\216a\345?\330\362\200\302\340\277\347?\200\234/\366\362\016\210?\010\247Jd\034\031\265?\260\036\357\252{Q\260?\262\312\017^\222\236\323?\200{ 
\311\326\255\255?\204\374<\177\242\326\355?\000\2142\224|\350B?n\241\323\312p\313\351?\220\034\334\311W\343\344?v^\3571\301\230\323?\270\217\261\300z\245\357?\244g\306J;\336\326?\252(\225H\221\023\333?yF\352\334\227\255\347?\002\367\026\313w\203\345?\312\\\331\020\207\255\351?Z\352\351\265\247\223\356?K\202\327(^6\355?\305}\256\237\215\366\347?\267X[~\265\014\357?\310\3634\373!>\306?^mt!\034\246\322?4x\240I\202V\342?d\210\274=\360\230\350?\000\315L-\013+h?\313\367\025.7\271\342?\200F\031e\325\020x?n\375\004o\367\346\334?X9\365P\347\243\274?\376\215\305\222q2\334?\211\021\202\'7\276\346?\274$\250\301\2037\330?!^V\215\031Q\342?Q\350\237wL\253\353?F\205EDD\004\344?$\017\231\r\212(\327?\361\334#\240o\006\352?\374dG\217J7\316?^zP\031Y\355\356?w\177\0144:\366\351?\034\240Z$\372=\332?0\2149C\267\032\312?Xx\325\305Cd\343?\014NR\337\362\035\347?\242=,\201\303\216\326?F\364<\017\014\265\352?u\231\314\3206u\351?\260*\211\276\357\370\203\366\356?d&\276k\251\373\331?\025\223R`\322\375\354?n\027V\006\216\034\353?\3018\354MH\345\354?_\025\027\223\344\321\353?\352:\'\004s\240\326?\210\025\376m\010\t\332?D*\273\246\337\202\352?\200\322\2023\363\232\300?\"\266\364\366WP\333?\257t\230\367\302[\354?Q\315Z0\213*\346?\024\240\334\r\001Q\321?\272\252\224\352\277\021\341?\3006\3438\257\273\255?\352\247{\375\\|\344?x\303\257\266\212J\354?.\022\376%j\270\322?\364\313\233\310\3235\315?H\371\017\232%\213\340?\300\356!lb\337\340?my\221\312\251X\343?\024B%\356I\257\322?t\243\271\036\341\003\316?\263\342oB\374\244\355?\362B\362Kw \331? \035\276\0357p\251?ia\256\246\324\217\356?\260\367\230\365MX\256?\010#H\255\334\245\343?$\303\312\326\032\006\357?k\006\003\362\034/\355?\000P\265I\356\237\347?\356\034\017\276\347\307\344?\220\306#\227\342\032\306?4N\325\304\025\352\300?F\273k\372\033r\337?Z\235\205\013T\357?q\373\252\351\377\360\341?\"\275\206@w\264\325?\376{|x[\353\337? 
\032\371\250\353\233\257?\037\232\221\224\246\327\341?n?\363\371\257!\353?:\346\250\252\025\'\344?4\3439\177\243*\341?\3247P\2531\004\324?\230\247B\035\250\362\342?P\0223[dG\351?\021\340\300\202L\313\356?d\241O\332l\232\322?R\020)C\351\327\341?\030\266ci\230g\274?\000zL\316bp\305?\377fO \033\361\347?\366\227E\301\tp\342?DoO\222\360L\346?\034\002I\203\320\007\300?\270,\377&}q\274?\346\266\247\251%D\353?\000\300\nj\375i\022?R\214\277\267\246\353\321?\356\257rD\250t\340?\235\220\270c\216_\343?\000.\273\357\337\340\355?\224\010\215r\301c\337?cq\316\217\221j\344?\374\322\367p\245\004\346?\032\220\251YZ\304\332?@i$\277\302P\347?`b\t\300\262\326\240?\216&\032\261\226\332\336?\255\207\265\272\211!\344?\000 \032\334\026u7?H0\034\305SQ\304?\206\242\023\364\307I\341?z\236\342\246R\336\320?\037\r\344\2051N\353?\272L|\255\315\242\353?\376\0169r\314\303\356?\362;W)$\005\353?JCC\035\214\233\326?\230@\327eI(\335?\000\2105\235\241\032O?\000;\367\230\001\221\352?\204af~h\373\347?\374\240\265\332\261\257\332?\244\245O\370\034\312\335?\350\236\027@t?\317?0\374=\335\327*\262?\032\230p\000\215A\325?\346Lz\021\361\374\356?\300n\231\333M\234\224?\346\324(\017\276\246\357?\213/\312\306\363\327\344?PB\376\376\362S\323?\340\367\317\257\220a\313? \261\\]r\344\353?a\267DS\340v\351?\243X\017kK \347?/\337G\032\010<\347?\354\256\317\303<-\300?\342i\235\362\243\315\327?\356\325\327fK\310\320?\330\377\245\336\363\335\344?\364\264)\203\351\253\351?\030p\257j,\024\301?@yW\342\361\235\326?\n\275\302\365\217\270\355?\314\213\201\244\215!\327?\252&\211\213\303N\322?#x\263Tqx\345?Dfy\035\021\336\326?\3165\311\215V\365\323?\024\027\232W\207t\303?\032\017b\0260\231\340?`\356t\330\\\177\356?\206\264_c\312;\326?h\010G\256\310\321\324?\'A\013\304\321\333\355?l\243o_~\001\326?\364\265D\313\335.\335? 
\315\264.\331I\260?\320\007\224\246\241{\357?h\212\310\315\244\035\302?P\334\256\254\014M\322?P#\013\366~J\304?\370\253\275ey\335\324?l;I\241\332\340\356?\366\010K\260W\026\352?\322\215\251\002\275\210\337?t\036\244xX\253\305?XEO\217\000\034\274?\326\317\355\203i\004\321?$^\035\226\260\241\343?\310\001\246\237\276t\304?\335vP\347\265f\355?\023\007\254g\331i\345?\200\252\257\225](\342?9\276i\340\023\312\340?\212\306H\244A\232\320?\256\220\314:\330\204\334?\233@\351\365\275\304\351?n<\214~]\236\353?\240\352>\363\021\326\327?\036qR\006\260Q\322?I\327,\030\314P\353?;\221\326n\005\253\352?D8\350\251\rN\302?\300\005\241M\326\221\352?\205\232\003d\221\207\351?^E\026\340_\263\340?H\221\233\313qD\267?\370G\224/\215\240\325?\000\265\342\204\307\224\236?\000\235\261h\223\254\233?\010\353\230\014\340\023\266?\017\371\314\273)\024\341?\346\302\331\\\261s\345?\031V\245=T\256\356?\t.\330\004g<\354?\266\030\233\206\017\225\326?q\225\341\310\254\254\343?_\354G\272\034u\351?0\003%\357\220R\304?{\221\351d\253\017\355?V\2451k\316\027\353?\nM\261\345\316\271\324?\035\343\000\355\3700\346?\217\347\216;\207\266\352?_\t\347\250\024&\340?\347\211K\226`Q\353?C\376\372?M\357\347?\016\3506\217\245|\322?\260\032\347\251f\322\331?\322L*\351\253\203\335?\264\314^z\333\365\341?(v01\260\023\265?\312\262\247\rPp\336?\374\344\371\001)\016\303?g\263\331g?\037\346?b%\215\003\254%\323?*\215\256}\201;\350?\217\002\010\365p_\344?\330\350\324\213J[\304?|\337\024\371\226K\315?\314O\371T\360&\335?\024\236T\3408@\312?\264\277I\344\354\271\315?\342\352fPV9\356?d;6\035\221V\350?\237\203\370]Zt\353?\240?R\213\355\271\267?x5\354a\365\313\353?I\306\031Ud\213\351?\256\205\241`\354!\337?MM\300\221\177y\351?\024Y[\014\327\347\347?\340`\301\342|\210\252?\304^.#\266_\330?\\\330\304~v\014\322?\204p\343\361\010 \346?\337\n\"Hg\366\346? 
o\267\"qX\251?\373\270\317\323,\316\340?\330\023_\"\261\274\273?\363\345\024-\336\001\347?\022\240C\016p\362\357?L#\\u0]\341?\277\231*\244b8\340?/\227\014\213\302\341\354?,\3701]\255\362\315?\260\2615\323PA\346?4\035\240\256/\210\300?\311\243wG\"%\344?\262\001\360\263\377\220\344?\010/4\322\377\267\305?\251\037T\241)?\355?\027\241klD\034\357?\335\216\226\326\t\250\343?\304\000\235\ri\222\336?\325Zc\300in\344?\216\017rt\270\377\344?\252m\002\267\311\021\320?<\243\234\215\301k\301?e\301\277\230\312\214\355?\376\030\255Q\336\230\323?\267\220\027\216*\226\340?\343\225y\022h\"\351?\002#\366w\234.\335?n \t8\345\374\344?H\367e\307\342k\354?\034\231g\221\362O\315?\346\221\206p\312*\357?t\'\005|\371\216\332?J\275\361\200h\206\351?+\243\014\\\201\307\353?\024\201WX\362/\327?xa\213\007\362,\356?\361hx$\2146\347?\230O{\007b\362\331?9\374K\025\212\253\353?t\001\324\027\227\331\350?\320\252\255p\221\377\315?\362u\332T,l\342?26\364\375\227l\336?\200o\356C{P\302?\350\265\006\317k.\346?\274Ws\002\237r\317?\360s\016\"|\245\277?\200\326z\327\217i\237?\350\021\250\003~\022\261?\250\314\250\017\302%\330?\274\254W\025@v\347?Q\177$\236\376I\340?h\271\235\256\317j\330?^\"u\215\327I\325?I\016\313\226\"\323\341?\342\254\347\370\307\212\353?\370\214\203{\340\310\336?\274\226\021\337\250\260\336?\222\216\033\302g{\336?\026\344\227\317\004\332\331?\'H\303\375\261\274\351?\204\204\t\202\210o\314?\\\330\237ko\003\300?\252E\361\005\372n\331?\000\330\304+\2069\177?\021\364\300\017\032\222\351?\365!/\n\032k\351?\304\200\316\247\257\260\322?\316J\301\201F\024\330?\033#\251\360\327/\350?p\200\2724Q\303\246?\007[\311\332\'9\341?H[\330\255\215\317\260?N6\305 
\2330\342?\020rVx\302\"\244?`c/\246c\214\276?\360\301\204U\007{\242?\320\036\013\237\013\332\243?@\260\346\355p\301\266?\221V\263\347:\235\343?J\355\r\325\034\\\341?\301+]\311\013j\345?\327<\214\210D\214\355?\030\272=\246\246\030\340?fW\365\017\304\225\346?\2345a\250jA\343?\300\334X\"\3120\253?\2003\307T\370N\351?b\225\264\265\237\250\330?F\271\261qx\227\321?\225%\362\026#<\353?\354\'\332\236U\331\304?R\303\264\363\305\244\351?w\2174+|\200\350?\326\207\031O\311\216\354?\200Pn\311\362\005\254?=)\355k\261\243\357?\250\353\005\317\020\352\264?\217x/\370\235\364\357?\260\014\204kzw\246?\":\370\224\026j\343?\245\355R\3352\225\343?B\306\271W\255\372\325?\335|\373\tZ#\357?\344.\207v\300\337\301?\360\202\020\021=7\254?|{4@;@\312?n\31366\365#\345?0W\317%\320\337\257?\027\231\254y=\277\354?2\241\234nZz\332?\236\222u\361\013J\350?\337\306\"\003k\216\351?\030*e\202*\024\354?\000\261p\215\377\201\307?5&Z{\220\245\352?\300\320\350\002\007t\211?\306\331wG\224\237\345?\000\231\261\372G\'\270?u\263\357b\214\201\343?^\213\020Q\321\367\327?\032q\227\366\246\017\351?\360}\017\344\024\t\354?\356AV\237.\244\343?Q\261\032\212\177I\355?\210D\021\032\371\240\312?\224\256\256Rs\202\341?R\310\'\272\315\t\321?\317\210dI\274P\343?\255\212q\037\230s\357?5\010\0273M\021\341?\306\355\206\252\253.\357?\220\003\344\177\222y\261?\264\356u\023]I\323?\350\336e\355\232\213\342?^\237\257\316\370^\332?H\343\301\264\365\233\342?\200\252\217-g\243\250?nr\334\361\317\205\355?4\337H\217\252\227\301?H\330\322\300}\006\262?$\235\333\267\211\324\315?\314&|b\276\014\307?\220\013\334\274\313\337\300?U\2773\022\240\343\346?\236f\022\201.\367\353?\320^\2075\350Y\306?\000\013|\014\257\255\350?\300}\354\345\301f\260?-\361>F\032\362\347?\216\304\320\027\361\332\336?g\355\275\024\312\253\351?\000i\262HF\321\216?\346\377\231@j\335\332?\020G\037\177^t\250?\340%j\007zt\303?\207\345-(\231\204\346?,\001\347Gv\376\330?\0172\362M\267m\344?@\262I\344\364\261\223?\030hV\360n\200\333?(\271\271m\324\350\316?\3162\372U\272\301\330?
\014\237d\356vW\336?\327h\022\303\210\371\346?|\313\006\231%\276\322?\330/\2728Ip\327?\347\373k\217!\230\340?0\313\014R\340\346\273?\346\n\374d6!\340?\260}|\340\302\027\270?`\360\341\202#\365\344?\007\375Q\272\307\032\352?\n\211&\037\214\204\332?dk;\221\345\032\321?\354\326\235W%1\337?)8\370\264 \177\351?\374\233\345\320\256\224\353?A\363z\277n\227\354?\262\227\305\204\265\017\350?v\3703Sv\022\350?`\335\221\253\023\203\262?U\331]\245\234\261\350?\3541*,\247\247\335?\330\261b\376Mq\343?{\336%\237Y\000\354?\300q$\225\006r\312?X\274\312\352&\265\320?\200\230\027\351\223\251\256?x{/\006\230\207\305?\220J\252\232iy\325?0\330\210w\213\217\323? \275\016\3362(\321??f\225$M\341\354?\0331\332~\277M\351?\340\366\273\271\301$\223?@\017\231P+\362\303?\250\021\n\310\253\321\331?\024(\230\206\022M\335?\226o\032\202;T\340?\036cI\n>/\332?JK\271\257\310\014\350?\000\221\343+\'\271\250?\320\230\272\304z\247\334?\363\3172\002bk\354?\340\023\346\274\356\256\241?#E\220\341\021\021\355?K\247\363FA\362\357?\244\014\324AC\021\341?\222\271p\245\207\304\332?@\310X\377p\373\264?\201\274\361I%\340\346?o\304\257\234\\\370\344?\250U\326Yu\216\311?^\211\275\246\261\033\350?\326\373\217\306qh\347?\344\356\254\034\243\332\331?0\231d\320\256\313\320?8QU\356\340\343\314?\234B\326\273d~\351?\200l\rh\336\271\202?\024\367\246j\274\202\355?\373_\361\263\311\204\357?7X\365\233\312\016\342?\330\347\004`\301\242\333?^\035M\202x\241\321?\"kw8\303\215\342?\232\303\313\332(\006\350?.\024\332?g(5\024V<\346?\240tp\367>1\341?g[\035#\241\207\343?\273\021\370\344:I\355?\007\212\261\002\\\323\342?eF\367\205\326\234\341?\360\336\327\236\263Y\354?\036\2408$\345\275\344?ge*ki\213\357?\210\242\305=\207\240\307?\307\341\213\222\251\200\354?kr\226\0374\311\344?\225d\351\226\373\201\355?N\013\361s\"G\333?X\374)\364\211X\273?t\026\310\363N\243\350?\004v&J\267\365\320?\361\314=\255\020\244\353?\327\331JT`\216\347?\000\314\307\231\277z\332?\240\205}\254\360\005\245?\240C)s%h\311?TB\332eT\343\356?8\025\311\'H\210\264?\226\212W\
243\201q\346?\234\264`\033`\277\313?e\007\311\305\234\001\341?\016\000\240\0146\371\335?\020\305\213\206\352>\244?BB$\230<\004\356?\204\271\2740\270^\304?\014\025\272k\301\337\330?\010:\3152\3714\261?J,0mc\032\350?7&\307C\212\242\342?Fm\026\377\037a\327?o\366?\005\246\332\342?\215o\275#\270\'\344?0\3231UA\031\354?d\332\222\363\024\241\347?0\004\206 \217\234\306?\2472\260&\264\305\340?\304\002\370\351\031 \356?\207\360\1774U\364\346?\033\201p\222\310\312\340?\302\340|v\272A\334?*\272\261\204\023k\321?\007~\034\255\317&\354?\014\335\315\275\0337\325?\214\344b\372p\017\311?\016+\301\360\230\325\340?\364\254\225\3777\220\315?\274bA\020\2577\311?\340\275(\232\244p\251?\247\2679\317\020\024\355?\320\2662\346\340\224\272?\340\203\342\317\347\227\333?lo\273\375n\006\335?D\253s,#\207\351?@\034\340n\223\276\274?\231\213U\354\301\345\341?\234\201\373\244\276e\357?\3161\0034\370\352\356?\336m5\221\025\234\337?\346\324\233\367\245\202\343?\340%Kb\363\244\263?\003\321\301\020e:\355?\264Q\337\210\372\346\335?\010\333\247\351G+\341?\330\037\014\244\233U\270?\257qo,;\256\347?\212\312\272\201\307\t\321?\246\014a\203\373(\343?0&=\334W\353\251?\303@%\345\301\360\356?1[i\320d\370\347?\030\373d\032\034s\261?`OXL#{\273?N\021p\010\266\333\327?\240\270U\177\254\343\304?\236>\231r\2261\334?\200\354axo#\217?P\247/\026>[\332?\366B{\273\344\237\321?,\330\031\251\305 
\312?\002\373\010\316\227\242\326?\272\r1t\177\322\326?\250Q\366m\231/\343?X\300\007\006\246\276\267?\300\200\252\366\362\223\202?\300\200f@Vo\223?\3308:SN\004\315?\000Q\367b\337\244m?\3703\245>\014`\324?\272\203\340B\260\023\322?\275u\016\310\020\302\355?\036f\354\376\273\270\347?\353}j0*s\345?\324\261\3239\367I\343?\206\036\242\371I\221\320?\256\373\266\230\203\377\344?\300\277\365B\007\"\236?\252\216\225\001fI\350?\320\317\207\203\371#\322?\265\2572n\\\307\343?\000\217\034\325\355<\357?\332\004Q\202P(\330?L\357\031Z\334m\345?\304a\315\223[!\352?-\364/\027<0\352?\322>\343\334P]\357?\340@qR8q\227?u\3566\251\307_\347?\300\244\253\3468\204\217?\251\312\320\371\342;\344?\200,\363\306\347\341\252?Z3M\"\001\001\330?\270X\212.]1\321?\221hzr\021\323\351?\300O\006tK\024\230?\372[ _\001\207\347?\274\254gp\244d\331?\325\010\320W\366\026\353?&?Z\343\217\023\340?V>\341\323R\311\343?\226V\362\032\316\372\326?\360\257\333\371\205\235\343?\004\001-q\352\030\342?Jn\244\370\032!\354?\355@{\211\350\264\345?z/\324\321f\202\331?\022kx)\013\036\347?X.\314\325h\004\275? 
\210\366\363\222\030\254?\344\n\246\255\302\024\303?\376\375f\242\217`\354?\006\271JB\224\223\321?#\266\322\210!\036\351?0\247%<\345\350\262?\345\350\206\346\'l\353?\2172<\257\327\261\346?VP\307S\317\372\354?@t\233u\306|\273?\256\016}o\010O\347?H\307\305)\340\335\262?\324\362\245\327\"\005\352?@\316\245\013?V\345?\357\361\247\274\346\224\340?\000\316v\030,\022r?\3627\360N\254`\343?<\032\315\026\031\267\333?\036X\364\035iF\334?\330\350\356>\370\237\301?t]\242\246\017\037\307?n\261\325\243s\n\321?\340q(d\276/\337?\204W\345\007\357}\352?s\235\"\302b%\353?\300\204\224\037\244!\317?\rMS$\371f\344?X\275\323\303\356Z\351?j\025\023\254\223\235\342?n\241@\203Bh\327?a\243`\020\352\274\340?\013\313O\276\372\"\346?|5\025\200\263\214\314?\"~81No\352?\254^\277\305\3076\340?\204US\252z/\326?\270\314\235\262\305\247\306?\024\344\373\t_Q\333?\034\345\253qH\235\341?\312\257\220\021\rJ\326?0,\211\276\002\367\242?X\323\203d`\315\333?\014\035<\0376\260\347? \t\214M\021D\223?\252\372\343yg\230\335?hPx\364\276\335\301?V\211\322\261\252\270\354?|I?\374P\036\303?\030I{\207j\317\343?\203\377W\216k%\352?k\013\210\361\203\337\352?\262\227%\266\262\021\331?\362\346\\\323o\337\341?\rr\352\262t-\341?\252\273\010\rCk\333?<@\001%^A\306?\253\225\233\242\346\347\350?F*\203P\006\357\355?\246\237\321\237v\026\324?8z\2253\365e\321?ta\246\263\214\206\342?\036\037\365)\214\307\323?]\234\251]z\377\354?\322\232\305\225\225;\326?\3265\003\035=\335\336?\315\256\330\232\323F\346?Wi\207\351\240\247\354?[\262a\'\257\253\355?M\016\362\030#c\345?/-\342\263\275\241\355?_\356\2111\'k\343?$\233\177\r\024\263\341?\232oY\t\267\031\341?\302#4\343cY\344?\220\024[\220dN\341?\"\3778\213\200*\330?\201\002L\324\260X\347?v\351f\325O\'\350?\000\317\371]\331\202j?\330\341\032\243K\376\303?#\023c^\024\234\340?4\217\204_\343*\310?k\230XDU\377\352?\320H\252\302\242\333\345?hBC\364\022\373\341?\n\336\tS\266\244\346?\254\022\2310xF\347?\301a\0036\320\274\341?H\260^\001^~\314?d\2577>\302e\304?\222$\013D*\341\332?\235\301A\253-\236\35
1?\274h7\t\306L\315?q^zH8H\354?\000\252|\227\007#{?\210$\006\357\234\315\275?\020/\265\037\223\351\325?\327\234.\313\030\260\345?m\213\362l\204o\340?\242\244\241^\207P\354?\016`\3055?\343\341?b\335RE\264\314\352?i\027S\231\265D\342?\024\rcB\200\364\354?\224\202\212!\010\027\316?\340\357\007\276L\326\272?h\227l\003\351\365\303?Qu\242\032`T\353?X\223\263\334o\216\355?Q{\010m\216\013\342?U\244+\320Q\373\342?\020T\331a\321I\326?&\331\t\345\250J\357?\004\201\350\320\260\003\306?\356U\371&5A\352?\210\216\350W\206\203\312?\322\034\211\316h\371\342?8\317\320\326\201O\321?rY\336TL!\333?\267et\300\344\006\343?\0001\312\267\3177\262?\200>/\250X\265\177?\302\376it\010\220\326?\262#\375\220^\257\343?\300SK[\312\014\302?\224,\277 wh\337?1\235\223DQ\247\346?\356\026\212\373e\261\336?\314\200Q\005\031\006\350?\324\211\344*\271\016\326?\2200\241\366\225]\335?\260X\217\356\370F\342?J\337\017\224YM\357?\017\036\276\377\004\226\352?\323C\375\261\274\326\345?\230\272\227\211\363\001\333?\341z\237}m`\355?x\266\034^\246\003\305?\342\313\307\306F#\343?\331\302\222\372\244\346\351?\261\031\370\324\235k\347?\316;\347w\251\323\353?\216R\213\030Cl\351?4}\221_\203\347\312?e>w\3464I\355?\272[<,]\356\350?.\232Z\2466\342\332?\303S\201\360\372\231\340?\022\244\216\326h\033\323? 
r\206o\332\360\343?\202\"\242\364T6\324?L\356\324\017\0100\352?0 \240\324\313q\313?G\275\026\374\377E\350?h\203\260-O\207\300?\037;t7\256\307\342?.\247{\332\361\224\344?\3107\241L\365\337\275?\320\357\267?\336\277\262?:\370\334\267`\360\352?2n3\013\376d\350?\"\335n\353\360R\334?\2749\304\313gT\303?n\\\211\324O\340\356?\350G\243\010\250n\274?E\207\340Z\030\275\356?Y\300\031\332\265b\355?|\357\177[\032j\353?\020<\345fn)\265?\233\014\306#\312\242\341?&\363\"\331|\314\320?\231-6\002\370\330\351?\374\254\215-Z\372\313?\252\367\350\206\320k\335?\327\224\014|\365\255\345?&u\037q\301\t\340?]\201\001=\307Y\344?`\217\316\374\274\275\330?\210C\320\303a\375\261?\332\2023\273h\335\341?Z\246x\357R\322\324?\007\212\204,\275\316\356?\212L\375\0028:\352?\007gM\274}\027\342?\246,\222\220\235\372\323?\200o\363\205\242\232\321?DGJ~\203\340\307?\313\253\3725\344I\343?\223x\r\253\352(\353?x\271\341\205\2178\274?D\031+\225\253\340\327?rvgK\310\007\342?\361\361\227\271\241\376\350?\006\333\220\354\007F\322?\035\223\264\247kq\343?\216\013\325\272\003\354\342?\3550\216\332\274O\343?\355\023\306\355\252\343\342?\345\264$YN\360\343?\261TC\212p%\353?@\362g!^\265\302?\206\366\027\264\211r\332?\230c\\Wh\273\271?.\262>\244\216s\357?\026y\304\022\377\\\334?\220\225\016\210\037\352\352?\000\200\355QO\255\244?\300\312K\262\tH\240?\010\002\331\215\342\372\304?\340N\313/;P\233?G\350\242\320\364\\\353?%@\244c\251b\351?\204V\236\2479\271\342?\351\310U\254\263\332\347?\362\326\0363\333H\353?\000\211\206\025\244J\330?\302\360q\223\276&\341?\330\323\347\000J\203\270?9>\275(4\033\343?\321\313)\001\270\361\355?\217s\233\211\300\360\347?78\340\235\272\341\346?A\303\004\276\372@\357?\277)\361\034\256\237\342?p;\207\367f\034\267?\036\257,\370\361\220\323?\366\270\250\013a\027\331? 
\3656L\327C\304?\277\212J\032\235+\344?\230x\220Y\361_\325?\205\2222K\273\330\341?\206\016\265\214\345\305\346?9nC\215\331[\347?\320&\303\262\300\244\243?\237&\325\265\362\240\341?\340\306\360\322\220o\256?\001\321\355\217\002]\353?U\204t\026\023\233\356?>\014p\224\344&\355?\030q\'\223d\375\335?P\271\361\363\213\353?\277Bh\354\245j\344?J\243\034\364\n\322\333?7h\343\244\317\275\340?\330;\2302\351\304\306?@\312\326\362\271s\324?\230%\334Di\207\262?Xc%z*\217\314?p=\275\030XS\250?_\257./FV\345?\265\264R\234F?\341?6\250\237\223\032\242\345?\340 \224V\343\306\317?\236\315\'\016\302f\324?j{\305u\252\276\332? \025\353GS\240\262?\356Y\275\36677\332?\374N\001F\207\327\317?\220;l\2312\261\241?@\243S\265#\321\273?\257\245\267\203\264\374\342?a\347\360-\327c\357?\002\214\254g b\336?\211R\003\003\236\204\347?|\003$(\202,\312?\210\270qj\275\352\307??T~\213N\254\353?\267\020M\271\357-\350?/\307\021\356\231\024\346?\030\376\355\001\226\274\275?j/\345S\277\270\333?\306\036\004\300\212\241\331?\320\213*\351*{\265?\370\253\260}\310\364\301?_\307\266\220\366%\345?@.\014Vt&\343?\020\033\307\305\227$\243?\301w\300\034\3767\357?j\005\305\264w\215\332?\306\303dqN\265\324?T\216[>\307\007\307?\036\026\031\3350\372\336?\316\373\177\200k\243\327?\312\243|\033\200M\334?`I;\003\271\373\305?Z\345\2316:\204\332?VE\365\314B\377\357?t\225\342\324/u\317?\217\317\304\005\350R\355?XF\222\256\342z\331?\000\027h/\236\005f?NS8L\277\231\331?t:\341@#\255\331?{O\255\234t\010\343?\247R\301\327\226\274\343?\026\360\370\336\256\342\330?\002\007\264\2562\034\322?P\3466\271\343]\312?\326`\352-\320e\340?\340K\344R\345l\271?\311\377\310\321\335\034\344?\353HG\276\337\327\350?\032E9\245n\020\354?\246\274\033\023\007\\\357?\226\035k\035\252\213\351?a\320\302C}\026\340?\234ek\001\3018\322?A\221P\341,k\352?\210X\031\025\317\374\346?L\357L\316\376\203\313? 
\035\314n\247\320\306?\310V\254n\242\361\270?\000AP\355J\345q?\270\225q\326\3314\332?v\303D\242`\250\344?x]Z\371n\206\343?W\372\032\267\353\345\352?$\353B\021\200\"\352?\260\204\364\032\321\177\242?\260\322l\325y\272\326?\001\311 \267X\t\351?pH)e\010\261\334?\240\243\r;\034\377\251?\341\324f\256<\214\350?\213\007\r\367\270_\347?\232\212\300\177\354(\320?f\272M\274\347\207\342?)y\2773\232\373\351?\243:\336T\020t\342?\244W0,\276\247\311?\325V\273.\216\202\342?~\375\002\233\364\200\347?b\240\316.\2358\321?\322\021\352\342Vj\340?\3523\3412\333_\355?\210\nW\264\004X\357?\360\237\330\242\351E\315?]\342{r\341B\347?\352\007r\367\223\363\341?\200\275+[\246\232\231?h#c\031\362<\266?\352C\262\216\030\272\330?!q%\035\275\020\356?\360\244\363\363\254\247\272?\364\0013\310\216\257\304?\326\006A|\325\257\357?>\213=\325\037\354\326?\250N\306:\312\315\345?\260\311\205\303R\003\322?\010\365e6\t\230\326?pd\343<\227\322\264?\250\331V\221&\355\345?5\365F\245\363N\353?0\372\267\373\022\265\324?v\010\275\017\n\326\357? \242<\233{\367\323?YV ~#L\340?\250\371D\304\177+\266?\331\253&1v$\356?\270Q\"L\013\330\302?\317\035t\260\006\337\351?@\262\310{\261\326\212?\233-\260\341Q\372\356?@\252\242\037\215\247\242?\340@\340a\327\315\272?\000 i\263\227\3508?D/g\216\354}\310?\371\370PKD]\357?\202\237/\2605\202\346?\030\021\n\337>\306\300?Ps\350\252P\270\331?\345\204\252Y)\'\350?\200\351\033\263\363\270\273?\300n\226\016\321\240\333?\000\373)\270\230\343|?P\227{7\314H\264?\350\340q\341\256\353\261?\230\002?Y\370\340\312?\330\310)\253\266\024\312?\246\336\237\205@R\354?\224\024\244\313\372\270\345?;\217\004\314\001\323\351?2\212\325\310\032\037\327?\260\370\362\363\356\370\326?.T\276\200\320\333\347? 
9\023\202\301\216\353?,S\250\037?+\325?&\244\326\360\276\r\321?D\004\350\332\003J\320?\260\324\264y\013Q\326?Ruay+\033\332?\347\361?X-\333\342?\200F\256(;\263\303?\3007\013\373\352\313\242?\354+\t\354u\365\332?>=Hv\307\345\340?R\344\312(Zc\352?\374LI\360\r\r\310?Lp\214\202a\001\351?h\032\260\013z\243\270?\226\355\323\2129x\341?p/G\360\357d\313?mDT\023\017\247\341?\260=d\364\3472\244?\344L\037*\301\221\334?\200-\367\263t\343~?\363\256\374\205\004\364\356?6\'\240$7Z\357?\270\211\236\313p\245\340?\246\263\242\377\033\340\331?\270\327\357\317\2026\313?$u\331/ l\347?k\r\304\200\"\032\341?\262\346\333\315\267\n\354?\2440s@\235\014\335?\007\273\351\336?\342\353?\323\223\344\234\037\241\343?\226\203\205\223\002\213\324?~\210\002:\331H\354?\220\025\256a\360\272\351?\313\013-\350\307\020\345?\234\312h\311\035F\311?\006\030\030\014\204\230\320?\2247\237<\241\023\302?@6\223\306\350\224\326?\354\021\232\353\362:\320?ZI_\r\255\323\334?\312@\202#A \357?\010f\344!F \334?aM\205>\334\234\355?\276\016\301\337\361&\347?\200\351n\247\372\314\352?ahh\355\356\323\354?X-\266o\221\250\333?\240\332\036\333\317m\224?F\210\277@A\001\325?\204\234W\272\254\200\347?\336\244o\315\327\030\327?P\246\337\271\353\036\277?\364,\215w\214\221\341?~)}\320\334E\332?\240\000)\010\347\n\226?H\250\3631IJ\272?\366\037\246`\330\337\326?T)\240\303\260\345\302?`s\270\022\324<\237?\005l\264W\270&\353?B\366\214a\342G\340?\274\363[|t\276\316?\277\0102\026\323y\347?\274\333\205t\034\372\333?\300\264\374\216\300\360\241?8\200\253\3626j\264?\032\371d\360\246\237\336?\310\216_z\256m\351?\343E8R`[\354?\276so\240?\037\320?\200\017*\325\325\355\335?v\034{\024\016\245\321?L\007\024\276\273\264\340?\006\0376\264\276\366\330?X\177\222\"]w\332?\'\345\t\304\314]\352?\266j\274\245\327\306\346?\324\334i\001\210/\350?\206\347\177\254&\347\320?@W\307\322=\314\247?\300h\010\2364\324\336?\371\335\314\301\355\270\343?l\324\375\247\325\273\321?\024\264OZ\257S\304?d\310L\317\003]\312?mt\263G\t\273\344?\270\325\223\222&\260\300?\234\375\
320XH\024\301?\333\022\203\372\343N\352?\362i\017\034\035/\350?L\267r\\\226l\300?\004\r:\273\366-\316?G\267 \252\363\024\343?\3707\242\224\316\361\344?\310ba\322\353\326\316?`\341Ga\245\234\270?\200\032U\037*xu?X\245,t7z\353?\2411u\37133\354?\260\304s\257\324\376\326?\214\363\213\310\324\242\355?\224 \346\372\243\233\340?\267\333:\336\021\307\354?1r\277\"\n\260\343?\256\223\204c\272+\320?\254\214\023l\373\231\302?\000\373\377=\026(\266?/\333\'2d\237\354?\337\303\3629\231\202\346?\256\3066\367\361\241\343?\250\364\251m\205H\314?m\025}\214\037G\350?\274\226Y\243eY\301?fP\271\203\206\252\343?\312\217h\331\317\353\322?B\277\300\370\377\234\327?KcA\302\235\017\352?b\013T\324\310*\342?\334E\235\343UJ\2450p\333?L\265D\356\376;\340?\306\362\2657\0347\327?\301Fx\000!\365\347?@\030u\377f\374\226?LD\254\335c,\357?X\306\177\211[\007\310?\260r\361j\311A\300?\024\326}Y\271\025\356?*\031\\\240\336M\327?\376\356\260\013t\235\323?\232\027\235\256\017L\353?\327\2063\307\211+\357?\357\367\270\023s\304\346?\360\336\250\366\n\237\324?\014\225\363f%\220\354?\367\006 \212\003h\340?@\262t\303\3213\222?T\'j\263\365/\323?\370\314B\355\337{\267?Z\036=F}\231\327?\2148\200\306\\\010\316?\024\004.\234\205f\335?\305\220\032\\\033\350\340?\200E\216z\rQ\311?\346]\270;`\206\322?`\t\242Lb\246\344?j\364\212&\263\304\326?\361\347\273\262\030\330\340?0\213+\037P\034\311?\004\324\352\315\007r\326?\254\313\206g\356\371\313?\020T\234\021\225\206\261?\216\270\005]\267 \353?\373Z\317\202\031\033\345?\351\241\317\006\320\253\342?e\247\224\351k\254\343?\216\204n\034.v\320?\334\364\250\344\201\205\336? \330\036\227\003b\347?tEb\272\020\036\335?q3(\212\302\264\355?d\357\332\252\265C\317?w\243\245;\343{\352?p\365\004\246T\252\256?\350\351\010(J/\264?\300\206!\265?\217\235?\001\337\344\307I\341\353? 
#\274Wj\363\337?s\316\204$\022\374\342?\024\2572[c\327\355?\200v\246\360\001\225\221?P\245\032\313\210f\352?\226-)\351K\032\342?\364z\325\304\335\220\340?x\201\304(\345|\300?F\202\244,\241j\336?Pv\360 \'\204\327?\364V\307X\354\036\327?\260\301\217\007L\331\260?\356bj\221\330\365\331?>\332\271w\343l\323?\201\214?Y\223\225\354?8\255f\327\247\377\335?\343\3752\245ZH\357?\224\177@\010I\236\317?\261\260O\263Qu\345?9gVY,\364\355?\257\350\014;.\377\354?\302\313\244\3200f\322?\000\032P}\"l\304?v\340j\302\376\224\330?\321\214\374W\363\005\343?\246\210\366R\375\311\330?\300\000\315\210\233\375\323?\372\0136\342.\374\345?;\275\n\235\004N\356? \265\207Im\304\255?d\035\257P\032\261\317?^\343\251C\241\301\341?\nH\232\263\346\314\336?\027G\t\230=w\343?~w\377r\212\244\350?:\307&\314\2544\331?\250\242\254k\300\006\260?Jz\270\025\356\'\352?\344Y\247\211N\232\334?\314\304*UU\333\335?(=1\204\342\247\274?g%\345\"\013}\342?\220\251\321\364`_\270?\000\216.\362CB\225?\2778\203f\230\356\345?\300\002O\231t\340\272?r\003\345\037\241?\353?8\002\347\005y\243\304?\014\224am\240\377\320?\361\345H\004\035\303\352?\300w\027\360R\026\345?X\223\351l\214p\302?\020\006\301s7h\335?\032\275\335\305o\244\347?\257=A{\355\020\345?\337t\203\"\014\215\354?\230S\342\363\020\214\356?@\250\023><\033\344?\\\233\245\2062O\325?hl\277\350\362 
\265?8\361\000Z\2639\306?3,\364\017I$\345?\007\003+\2019\026\357?V\021\353\346\037\030\354?e\306.3\017B\341?Txd\026\267`\316?K9TP\356\270\351?.\374\312\356\321\344\353?\\\226\222\330\311\024\350?\310zHH\003\245\332?_\222\361\006\333\t\356?\372]j\250\352\256\346?-x\004:\027\032\341?~\362\271G\335\324\354?Q2/k\311\212\352?\001\244\271rb\361\350?\220]\274zC\231\317?\254\326V.\301%\331?\014\265\023\332\212\203\351?\203\025N\213\223\253\354?\234\206\361\033\312m\303?\24400\322u\223\310?_?\253\035\227/\345?\207\260\372\321\027N\351?0\30356\255a\247?P\256\317d\302\324\325?-\360\343\356\267I\357?\2024e.\316\216\322?\242\250\271\304\236R\347?\031\253\207Y\030\030\342?\343\3557\337\001\304\354?\240\275\376\037\0327\352?\260\'\374\336\357\331\260?\212\376\372~\246\330\356?B,\"\317\247\245\346?\000\201\342\354\365Sf?9\006^]V\254\353?\241\301\212\313:\371\344?\336\357\3409\250Q\335?\355M\262:\353M\340?@z<~\335\355\251?\224E\216\322\220\212\320?5\367j\357\352j\344?5\251\0103\007\336\340?\200\001_\254\362\321\232?dP\342VH\347\344?]\261\202\006\357\334\345?\346\240\336I\022G\321?r+sm\232\340\354?\rl\357\325\312\244\352?\247P\2368\203\250\343?\247\017\257klt\357?\343\367\337\035o\t\346?\000q\246\r\344\271a?pe\342\336?\346\251?2\302\334\301\312\310\352?\334\330\017uPq\304?a5\277\264\215B\353?\275\227\370\303r\030\356?\241\035/D\201\036\347?\2270\327\026\020n\340?\232\216\361A\017\343\324? 
\006y\315#\021\256?Qe\n7\007\"\353?Z/\033P\310\020\343?\214\311P\\\337\377\354?h\000\222\366]\221\346?qR\376[;\340\344?\234\217M\006\203\371\357?\200K\300?p4\224?\312\307\367\013q|\344?(ha8\005\027\264?\005N?\006G\213\344?@\177N\236\354x\276?\006\022\177\263?\026\355?@\027\335\206d\377\347?\214p_\004\216\037\357?4\332p\230\303\203\340?\2255\355J\023\004\354?\374]V&U\253\307?\272\324\022\035\013\353\347?\304\277\210\206_\277\311?\273\217z\224\231$\351?\236AZH\212\310\342?\272\307\350\247!:\324?@xB-\227\265\350?Ez\202R\001\003\356?T\034\260\033\2258\305?\000\035\255\310\221g\264?\230\346\027\344\366\351\341?\320D\033BX\203\321?`f\340\023\245\201\342? \027\234>r;\247?\tI\033]\302+\350?\3403\274\242\315\312\224?\026|8\010t\321\337?i\314cP\346\245\352?\222\301\371\025\007f\326?\261\nQqN\212\346?\374\207\255\217\216Q\335?\000\344\222$\2054\205?t\367\354\265\263\273\345?\234\234\022\013\004\177\327?\272\224q(\372\242\332?\222\334\333A\252;\351?\237A\310]\246f\346?3\010H\234\261p\341?\020oYK\212\224\336?\200\323\302{\337\341\302?\232a\352p7\331\336?\266\314a\\\234\037\322?\031\210@\210\216\t\344?\220\216\325\235\007\302\310?|\277\214\342i\207\327?\001\356\321M\372\217\345?\"UzZ2\014\320?\2326\311-\0028\351?\340\221[1\305\356\225?\331\204\247\370DS\353?\254\214[z\2325\357?\200\236\031\036\225-\213?\213\241\334\277\240\315\342?\321\0069\036\\\334\352?b\254~\310\214\254\336?\220\330\364\237Y\210\306?\364G\210\237\317\372\326?r\361\207\372\244\220\354?\334\346A04\250\317?A\343\361\332GS\346?\374\033\032\313q 
\324?\241\025\2739,}\353?\351\302)\341\343}\355?\227\206\20104\200\345?pA&\014;A\254?p\261\246\034;\345\315?\030\006\\z\227\360\354?zwqg\224\214\346?\002\372@\342\374\273\321?\256\003\235\302\360\201\336?H\261\267\256,\350\327?~\260\344\271\214\214\347?\270\346\325g\331j\302?\366\334V,V\331\337?\3605\335X/!\255?bB=\323ku\330?\275\277\304\344e9\351?\214\326\3635\374\223\310?\210\300\002A\332\371\331?\371\023\265x\247T\357?\300K-\214w\265\302?0^\233\251\023\363\302?\354\240\223\352\247\021\343?`|\262ml\317\324?\212j\345\020n\273\344?<\361\3717C&\301?\314m\253|.3\305?\250\007y\343\241\036\342?\354\316\371pm`\300?\363\200\037]~\241\354?1\265;\262\300\312\342?\000\316\0175\254\363\241?\031\0021\037\363:\343?r\235\037\302\365\036\333?J\250\330\256\320r\357?\340l+\031\314u\261?\322(u\3600p\357?\266:\312\324QZ\340?`\267\204\252\274c\223?\222\245\325\341]\234\326?b\016\221J\323e\330?\244\207\262\210\360\027\302?`y\301V\200\272\313?\234\273\'~xz\340?X&%\260O\032\325?\217\305\252dN\"\343?%X\356u\355\200\357?\311&\352\226\375\250\356?\224\253\310q3Q\320?\304]\222\022\201a\347?\214\222:w\270-\330?\234P\"\315`\006\337?K\362\317\027\371\324\342?c\263uWG\322\350?~\nY(\030\335\335?\260\027\232\220\374\323\274?\344\277\014z\310\232\327?\2704\377vE\037\262?hO\224=;\035\300?~Q\353\242O\002\332?p9\001b\222\205\320?\"\037\361M\255S\333?5K\031\377\361:\355?\353|<\242M7\340?\263\235\374\013~\016\343?Z\340^\341\230B\350?\340\261\021\212\304\270\343?\023v\226p\374\001\350?\332!\014\273\351L\335?\217\032/\025%\002\342?\330\345\013\230\336\355\263?\266/\215)\351\231\325?\177F\355\256M\005\341?\372\2070hU>\321?\017\355?Xb\244\347?\322\034\242\203Xp\322?\354d\261\363\020\312\357?\000\016\t\345\344{\246?\302\271\320\376\261\266\337?/\036\327L\343\371\341?4\"Y\246G9\312?\250\000n\013\271#\331?$\322\244\251\033&\322?\252\310\r\310\350\030\322?\362P_\322\2072\341?\336\351\0228@x\321?\240\356\306\014Zh\320?\303\342\217\352\352\207\341?@\0075\252\233\237\271?\367\311\366Y\222\033\345?(\0342>v\017\330?
*_\277\001i\204\332?h\036\244\376\367d\335?\256\254\331x:\000\350?\3149\032\317wI\354?\005\037sC\224\025\353?\266\344\270\330\014\010\321?(\025\344\030\370T\315?\334\356\352\323\006\267\316?\370\"\013My\213\265?\322\004,\005\302E\353?\212\207\220\233b\364\334?}M^\235\035\207\351?\314\252\312_\250\217\316?\262 \264\337\241\235\343?D\355J4\251^\343?p?#\275|\034\250?L\257\025\301\354\314\342?\256\330~\340@D\334?\303\353iD\334\212\356?Ru\007\251t\246\336?4\2216Zc\224\320?xm\264\224\202j\323?B#1\314\353<\353?<9\365\300\207\032\334?\234I\362\2544w\302?F\370\275\325\202\377\334?r.\203^\262c\350?\300b\314\352Z7\217?\020N\227\347\252\016\343?`\030\352\344\324?\300?i\t\311\214\246\324\353?\310\353\335\202(\\\352?\204xD\205\364\362\336?[\315H\002#M\355?~l\211[\305\222\337?\272m\"\347\\\265\330?hw!\333\225?\320?-\331\2724{\253\352?\326\210X\365\213\275\333?&\226]\214\241\227\340?9q\247m\275\023\341?\244q\325\255\276\305\334?\"\202R\201\037\233\322?\340\245GB\257\027\270?X\177\254\302\235Z\303?\366\236\\\235(\206\357?6\351\377\342\323S\336??\352\233\313\226\033\344?\210$\263\tKi\320?\243 
\206\320\303\212\342?\200\217\224\250L\266\252?4K\247\330#\277\333?\370O\303\351i[\337?\300A\246\023\324w\313?\275\354X\274\210\353\345??\343\020\320\344`\346?R\'\322\326\255\024\334?\340\332\375\031\010\264\326?\347\023;\260\213\032\351?\234\262\223\361_\231\341?R\304\343\3037\347\334?\020\303\321m\006W\252?\300p\235s\231N\277?\335C\237\206\333\244\351?\250\017\253\272\210\254\345?\257\307\224\306\306c\340?.\266\004\204\245\004\342?5\244+~\026]\341?\230\210\203\237\\\343\342?\374\3470\264\2327\305?\210\005\242A)\210\341?\355>{B\204\324\355?\272\2325\217\274\010\323?\252L\216\303sZ\340?:t6\347\037\312\351?H\315\367//\300\320?/.\'\201t~\354?\016\r>\246X\310\331?h\266W\256-\277\267?\214\275\244\237\361|\334?\342\356[Wz\'\347?\352\016\302G\224\234\325?bq\027\304\262\264\347?\210\037VF\016T\353?av\314\246\224\303\342?\"\023\026\254;\374\332?\325C\241\034Xx\343?\247\261?S\010\324\357?P\304\265\247\250\007\321?$\357_\\\201\021\347?\230zO\371}T\313?\320\r\325qcq\243?\370\210\234\362\263\277\343?\230$V\374\266\235\320?\305\262\237(:T\357?G\370\322\225\302\334\346?\364WZ\262h\326\330?0\nC\322\325K\356?Al\303z\272\016\353?\0249\230\224\017P\331?\"\362\377\235\347\221\357?K.\n\\|*\354?\267w{\236]\277\345?\200;\272\302\270\242\241?\274\263\362\274\033v\320?H\203\342\350\267F\330?\372f\276\366\025y\352?u#\001m\"\001\353?\342I\343\365r\227\344?~\033\317\007\272*\352?.&A\262\231a\352?\270\303\237\302t\257\345?\242\313u\3202\\\341?D<\353\267\345\364\317?\376{\372\361o\354\324?\022d\327\n\263\343\332?_\277\243\263S\035\353?\270h\'xs`\351?\300\026\013\365\307N\252?\240R\351\202\240\272\325?\320)\221kZ\205\343?\214\207\362G\331[\334?;\271\356\r\205\022\350?\220C\035\261\326c\271?\220\373\031a\325\273\304?V\031\354w[\357\355?\304\030c#zr\336?\000UO\304H\030\341?H\302\206\t\345U\346?\273\006\357\020d\255\346?\034\327V\357\241\306\343?\320\324\330\256\216\260\347?:\262]x\332\331\321?\210\002x\247\365\025\322?\274\221\375\301\245\240\311?\271\370\223\335 
\236\355?\273\341Qn\010\352\347?\344\366\001\035\250\275\355?\3309V\376\031\362\260?.L\005\000Z\271\340?\\\232\330\010M<\317?\031\376\205UW\370\346?\000Y\014\370\356\232\204?b\211\222S&\374\343?\327\010xs\343\n\343?\202J/H\330\310\341?\367\255\362\010\313\037\351?9\332Q#\224(\357?~\266\020\252F\224\335?2\335n\026\021H\357?\370\316\'\235\373S\304?B\304\035$\371\352\335?\260QRG\2245\254?\200U\362\261\221\372\201?\377\275t\031\302J\353?\354V\321\005\017C\354?\326\r\303\336\275\362\341?\263\201\375#\246V\340?\215\023K\036\016\350\345?\271t\217a\356%\355?\343>\327v\263p\355?%,\205[?\345\350?>M\210<\237\251\341?B\222U\376\036v\345?<\275m\230\0008\324?\200\351\321haM\233?\274\356\351rVv\340?\030\257f\332\265\330\267?\034\345\177\034\3765\303?\371\273\010\r0\272\341?\270qn\266\036H\311?n\336\005\250!\002\327?\230\020y\242\371\013\332?P\033f#Q\024\321?4\016\241a\302\"\320?\2607\271\241}%\326?\260^\321D?\234\257?\330\0010\014\242;\356?e~e\267\245>\354?\212/^\025\220/\345?\ny\013\276\217\214\347?\032\n\373g\330\250\321?H\377\202\016\025\204\323?\030\256\374\321U\370\322?\202\3205v\240\036\331?\370\026\351K\231f\314?Pr\006\030\304T\303?\022N/\315c\354\321?\332\275|\247Q\350\330?h\352\225\252\364V\267?>X\260u\313Y\355?\250\036\267\357\3450\351?\'4\210\007\257S\354?p\333\260\342\3646\326?t(\334o\375\371\306?\037,d\0131S\354?N\203\r\257\224\205\340?\204\265\177\255\2248\343?\231\251\017-V\321\356?\204\375\207\363+\033\341?\270\326w\366\234\270\345?\364g=\340\235F\327?\270\351\035E\t\271\320?2U3\210M\372\332?\220M\247\264\307\324\351?\367\333[\001\225#\341?\233\030\236<\267W\354?d\0321>\274;\314?\250\t\213\350\343\033\337?\226\247\014\243C/\327?@OM\330\277\353\205?\230\3702\366^\322\344?\020\352\262\253\260\223\302?\232\\\3403qJ\354?\370\375}\002\243\330\352?\036\0324\332\177+\326?\206\037\256z\2754\353?\342\274D\004k^\350?\230Q\351\354\361I\344?\275\006\031\236W\206\344?t\365\313;\246,\304?(\264\320n|\367\306?\306c\362\354LJ\350?\210c\353\"\247\370\261?`\300R\312\327\361\273?\031\
361\355f\303Y\340?\001%\336\363\272N\355?\350Y35/\256\337?\321\177\377x\335/\345?\216p!\2760>\326?~\336\n\343V\355\347?\213\211$\267>\270\343?\320\003`\261\037\202\355?p\357!;\242\262\313?X\035\270D\307b\312? \205ls\367\316\305?) \tx\275p\342?OnN;\026\261\355?\200\354\330\236\244\266t?\314\316E-\351W\301?oc\231\224%\264\352?\022|\356\t\332\307\341?\367\373\\$\302\242\342?\344\"j\311g\373\336?\257NW\314\315`\344?\301,|\031\317\242\345?\306\020qo\003\313\323?9\366\010\335\200w\343?bu\202?\217,\357?\206\210\266h2$\330?\260\'4!$\216\247?\374\036\245\200W\035\334?\327\267\361>Z\267\350?h1\'\006%J\264?\300\326\241\334*{\300?\347n\010]D\217\355?PZ\001\332o\225\351?\320^\361\272\232F\346?\036B\035\241\'\375\345?\260b\200\253\246\255\335?\315F#\032F\364\346?\346.\327!\243)\323?\275\305oh\036}\340?\240 [\036\235\037\250?\260\373\034\352~\014\357?\177\025\007i\305\001\355?\000}D\235\364A\237?\300\261u\035t\276\210?p*\035~\224}\272?,\201F\033\262\034\325?\336^e\237\0161\323?&\376]o6s\356?Id\204\017^\311\341?\210\251\204\200\341\240\337?\220n\201\364P\355\273?\254\233\307\023\207\006\332?\n\225D%\006\014\330?4\376A\023?6\310?\302:\343m\366\225\347?H\334\025-[\240\301?\222\321\344\217!\n\327?\244k\347w{=\337?\343\270Rfe5\341?\3112\314\352k\302\341?0\264*\237\275\274\320?P3\213Qn\237\311?\200\314H\017\246~\257?\276y\363\n\232l\356?\342F\205\265)~\342?\356\332[\373\223\231\325?\354l+I\260\323\356?`\324\243\265\177r\330?0\365\371\245\353\204\241?e\374\316\373\356k\356?d3\310jNd\321?4\261\253\312zU\336?6\207H\232\226@\345?<\213O\221_{\326?\263B\317\021\354\036\345?\nu\213A8K\322?B\354\342\010\231\003\343?\214\034\224\264\016\217\330?\240\205\333\"K\346\234?\337$#\341[\330\342?\034\317\313z\221\200\312?\355ms^\337\004\353?\204\351\014\031\207\365\304?:\356\003Z\021\303\342?\377\000V\333;\343\347?\370yNa\374P\323?\304\013\324-\353 
\311?\273z\235\016\224k\346?\252-\216\275e`\320?b\334\220\022&\325\325?\270]\025\365\264d\331?\nf@d\337c\344?\200I\374\203\245\347\252?`\360+\025m1\333?\253\320\373m\356?\220\222YQ\234\223\327?8\206(\036\223\353\320?NG\304C\346|\327?\226\004q\255\3724\330?\224j\030\2032\344\321?\013\324\225\033\260\002\347?2\3571W\314\t\326?\372\272.\230`\\\335? \031.\235\026\241\221?^\201\356\323!\332\352?h\305\343\2133\341\277?X\362h\003\345/\277?\302\036Y\201\022d\320?6\254\000-\351\240\330?1\321\355\363\247\005\357? 3\371\006\314>\344?\255\343^\247\223:\341?\350\366\232\t\307^\350?\000\243\340\354\354\031g?n\"2>\265\367\332?b\346\302\324\377\204\351?\276Si\351\177\250\326?2<\003pRH\326?s\211\303\315\367\213\354?\020M[\230\246\313\340?\350\337\306\332\347\315\341?\214\217`\233\025G\306?)\365=x\2027\342?\375iB(a\322\340?\246\306\327\346\227\267\326?VX\273f\274\216\337?p\345\300T\014P\344?\240\0077\267=\010\333??\2215\310\"g\357?\032\326\3745z\242\334?H\347\343\320\036\345\262?hco\030+\351\306?0\036\255\375\320\031\251?\006\337K:\371\023\332?\242\376p<\256v\325?\246X\021\020\267\217\354?fT\244\225S\204\355?\000\230\252\215\026.\312?\316\262\200\251\013x\320?\032\324l\234Z\002\353?U\311\331\033\2176\340?!\206&T8]\357?\246\205\000\322\304x\344?2\017\347\312\372\242\322?X\334\312S8\335\345?V\214\031\230\033\316\331?\340\274\362\213\213\021\251?\354\313\027n\232L\306?X\336{\222d\276\311?\267\331\257U=\246\340?*\031\371\317\0011\332?\310~\216\212e\205\260?V\326\330\207\034\211\333?\366\033\304\367io\335?\236\264m`\014\007\332?\266]_Gc\350\353?L\373sg\244\230\303?\372\225\307\220\305v\332?\014\364m\017j\313\301?H\313\304Cp\343\326?\321\237d[\215\t\350?\255|\276:\202\204\354?\262V[\246\354\352\332?\371\037~\317u\037\343?x\331\244^e\217\277?\2668\3250\347W\342?<\241#u/\314\354?\333\266\361%y\345\345?>4(\202\364\010\347?\360\223\274\254 
\020\271?\354\374\023\216\211h\302?@{\233\264\001\242\341?\325\020\3014\243h\345?\243\025\225\316x\266\341?B\272\312:\224/\320?\274f(\013p\370\317?=]>m\247&\347?\232gwz\367\252\337?\210\225\30133\306\346?\26294\317\347w\347?$\302\331+\030\036\335?\222\213T\312\375\"\330?\301\214\251ns\202\346?\324\207q\221\237\034\332?c\272\020\253>\244\357?\226=G\361\221>\342?\316&\t5SO\356?fh\321\205\333R\334?\200\271Q8\026\242\223?l7\263\2559\200\322?bUp\336Kz\345?\306t\214M\322\036\332?\200(\230\306\251\346\321?\344\3239-\343u\357?\3003\362\341\217)\217?\203\231q\346A\021\356?\350jK\374\307\370\343?FD\240\000L\213\323?jW\271\300\372j\341?l\353\370&\304\347\327?\271}\035x\304\235\343?\253-\001\201\221\372\352?n\035\223\023\261\270\351?\347\360\323\275H\276\345?\372\022;%$\362\343?\031\2466-$\204\346?n:\021\256yl\336?\004j\266\365\301\205\335?\034\3367\215\373\251\300?w \006\327\220?\342?.\377B\365L1\330?\004\336\031T\230@\347?F\370\244\336\216%\353?s\366k\020\317\216\345?\270\325\253\365\3450\341?\214Uv\222\305\267\345?\201\2779M\312K\354?\265\2636\347P\000\343?\000P\334\326\030\252\212?\376=\371u\211\235\335?xP\316\331\201\341\306?9\263\376\017-\002\343?\220\267\002\370\333\016\316?\350\313\250K\006\352\327?\334\017\177\341\004\315\302?\227\244\323\366\353+\343?\327.bD_\232\357?P\353\235\250\201t\277?\243t\243=j2\357?J\205\301\273w\252\333? 
\235i\2427\027\266?d\312\366\240\030\024\302?\351\211`\316\310S\340?\220\205\021\300\221\253\264?\236\316o\321\365B\332?\351e%$\201\250\342?(\007\333\021\236\251\267?\227\362\247M\247[\350?\324\362\215\361\365\331?\313\036\255DV\022\351?\014\000z\2762\367\326?I\027\240u\252\363\342?\221\344c\303\205\344\343?\314\004n\365\026\307\355?\234\020>\300\314\216\351?(\311\256\344i\017\347?\326\201\210E\0062\327?\227\226\020\\\323u\352?O\334\025\005d\322\352?\020y\311\320#g\330?\267\220d\226!\314\357?\360\260\252\313J\020\335?\340s@\357 \277\336?\030\341.\304D\312\306?\377\241k\326+\254\342?}\r\227\244e\332\340?\224$\373\013#\357\345?\006\326/\326p\031\354?\200\022G\270\"\344\234?\030\210ZDn`\313?\232\'\n\201\001\232\326?\306\201\034\237\255\216\321?\320\3468A\305\026\334?9K\032j\347\033\347?\350(\314\312V\213\340?`P1eA\344\275?\3310\177\2605\225\351?\020-\370\264b?\354?\322Y\032\354\010\270\331?j\312G\226f\364\331?\340\275\006?\017\204\332?\206\344Sj8i\342?x\260\335W\314K\321?f\365\033\232\254\315\330?\300M\327ZF!\341?\030\035&v\336\206\277?\360\274\350\354\261\306\347?\334\\\254\330\004\260\344?\213u\302%KB\346?\332Bq\300U\246\337?X6\032\353;l\351?\340N\214\203\301\301\352?\215\211\222+\020\206\341?\357\227\016H\017\304\355?\\:\2372\006\336\332?\336\231\361D\023\032\333?dB,\221\201=\326?\275\375IZ\274\247\350?t1\310V\212\332\306?\021F._ y\357?e\204\263\372$6\344?\357tE\221\223\033\350?\214\335\032\034\326\247\342?\373-\025\013{\203\351?p\0164a\250g\326?\017O?Aq\320\352?\300\247e\333\026P\262?j\277D\345\000B\325?l\n\206\302\3236\341?\002\253\334{(\222\331?\010f!4\007\341\330?\001\277\264\301\r\217\353?|\006m\006\333_\343?}R\234\347\017j\344?\030\357\177\275f\331\300?D\212\362\334\033\377\341?\007l\256r_\037\346?-\270\177 
/\001\340?.\002\253\001\365a\350?r\2251;\213\026\354?\344\336\340u\333v\327?\210\320\361\237\236p\322?:\315\224\314\217J\322?\241\360y\334{}\342?j]\275\251\253\031\340?\260\201\227\341\344\001\241?\304\301+\210~\241\320?\340\212E<\021\207\231?\243\273|\316\336\203\353?\016\000\r\014\322\006\355?^\260\331?,G\360l9\235\345?s\335#\302\210\211\354?\200\n\323\\\\\370\235?\210\033K\337\350\243\313?\206\361He\370\236\354?\206o\232i\255\231\321?\215\326X\tY\031\353?P\314-\366\201(\305?DK\022U\200\230\351?\231\327o/7\254\345?\331e\253@a\"\342?\360`\274{\240\240\351?\344\240\236\201\032B\303?|\003q\364\0330\304?\006\320x\336\257\006\321?P\240\203\276\3065\323?\235\375\275\014\261<\350?\330}\002f\374\301\267?`\273\3008\241\276\356?\344\232/\323,\214\343?2\005\\6L\330\330?\264\302\2019\336\222\323?\017\274\325\276\323\275\357?H\337\273\235/\246\342?\320\236/s\2624\302?`M\204\323z\027\347?O\325\304\250%<\347?\231m\016\001\276\035\346?\254\230\224\310\317\177\322?\027\023~\315y\367\350?X\333=\356\374\017\321?\202\345r\006\307}\357?\372\303\245vd\343\331?\322\025\331\252\013\274\323?\220\224{]\374\023\276?\034\031\272\375\256\324\334?\214\327b]\347\211\321?\322\n\340\367A\317\352?\356\r\323n\325\344\326? 
=PQU\216\220?aFv1 |\354?\330C\023\000G\336\327?\006\210zs\010(\354?d\346!\237\207m\311?\0240\316\272c\307\315?\2633\231b\025E\340?\304d\364L\246\244\342?\214\332\217\001\037\232\314?\234YG\335{\270\355?\020\312\367\031$.\242?\222\261\"\207\rj\354?]\221`\034(\\\343?,6haG\027\336?\354J\027\327\3707\310?\010\032\250-\347\006\346?l!IBFP\327?\265+&\251F)\350?\010M\263\243\264\035\261?\331\377\354\223{W\351?\000drn\204\213\230?*\232Jr-\232\320?\322\273\007\326\001\334\332?4\027J\034mW\320?\360\245A\225u\327\261?h\001\365\021\024\003\347??\374\205\340\003\242\342?\340\002\210u\266\337\327?\272\250t\352\234\343\337?7c\236\373[b\356?O\036\367Uo\261\346?\000\335i\005\"\035\211?\212\340\322\232\305l\335?\340Z\256 }\007\300?\260\230dD\215\333\261?\030\313\250\251.L\330?\366\020\010\030\234 \346?\3667\010\244\263\234\352?\250]\260\026\035\350\337?\026\367/7df\322?K\274\336-;\352\353?\247\353L\364Y}\351?\200\305\301r\217\205\212?\356\2117$\315\307\342?\000E\377\250\016J\243?\236\225\321^\236\276\333?\024\267\003\300\332\311\306?x\204\340\272\223e\336?\360T\027\177\264\330?F\330\255\251\253)\340?\256*\354Rxp\340?\314\272\362\313F\017\323?\220R\031\263\256\031\314?@\223\305\264\235\371\207?R\333\303\234\003\026\330?\2075#\324\372\273\357?B\274\210\022ob\323?\006\316e\232\317\304\346?\032\216\250\020\371P\357?bl\237\177\366\023\325?\004\252\013c\246F\306?D#\312\363r\365\342?\032\3547\004\n\027\333?4Kyf\227\337\347?\356)\271\021b0\323?r\270pF|\246\357?\026\316\352\375lN\346?\3304p\230(a\305?\\\342QTZ\215\330?\314\367\302D\337\317\305? 
\010\002\316gx\324?<0S\000\242M\305?\rK\016\347\271.\355?\035\r\261\263\227\310\356?\201\352\234H\002;\350?\0246\346$1\332\313?T\225\265\252\031i\311?}a\276#\246\321\345?0\325\300\242Z\030\335?\330\2360\\\247\312\265?^\\\257Sr\275\332?\222|&\247\310\316\352?^\032\363cS\347\343?\344|K\273\241\213\340?\254\305[\232\377\350\337?\231e\332(\347m\342?\212[\325\214\320i\333?\304\356\311\326/\252\311?\340\263\216+\253\355\316?\370\3531Q3\217\315?\213\331\227\030\276I\352?\013k{\240\340\250\354?\270^{\253\243\003\301?\001\246H`/L\357?x\210\354\344\002\345\313?\036`\271\365\'\330\353?\240\2255@gf\234?\2622\026\341E\230\331?\312\032\021D\356#\345?T\376\027\341\3337\315?\3749*n%U\302?D\373\330\027\270\205\356?ZH\264\005q\351\357?P\013\326\3709\354\357?{\221r \345i\356?8C\237k\017\353\353?\225\025\010o\246q\352?V\274\362\342\014/\334?\270\234\035\245\230j\330?\336\365G~\341C\326?\340\016\215]l1\224?4\277\265W\214P\331?\354B\277,\205\177\334?\300\222V\276\035\236\232?\325p\213bR\033\342?\216s\367\250\272\n\343?W\246\271e\\\272\346?`\234\016\257 \246\343?\354\216E^C\226\323?\224\273V\204\3728\307?\250\376\205\364:K\276?4\224:\236\206\035\302?\")\367\315\323;\332?\351v\327\264\250\035\344?04\316\201\230\270\303?$\0009@\257\317\300? 
\212\245W\227\206\270?SY\030j\335\021\344?\020\310\374\266s#\354?`\305d\003\231\267\270?\020\021\230\363\341k\347?\222\216@]0\n\324?\241?\313{\222\r\355?\340\347VN6\007\241?\273\371t\216\274_\340?_\222c\303\214\230\354?\010\007]\332\377\244\315?\200\211\257b)\177\347?\30411\375X*\352?\343\002O\032O\t\344?\270\372\027k\377\351\303?\234\003\353E\352W\326?!\304\341\210\326C\351?~\177#\017A\354\351?\270\204W\307\027{\326?\340W*\203\207\302\304?\372\274\253x\327E\323?\026\365U~{^\343?\313y\243^b/\351?hy\274\262e\340\356?o\352\364\304\247w\344?\270S\202)+ \346?(\206\227\005\302L\310?\210\262\371\216\345o\315?\200\323\300\363\265\'\240?\370\244rG\306\251\327?<\026\372\202r\355?\220,\313\000\267R\241?~\362\305\370\2124\331?\205\340\305y]4\343?@Q}\227\365\007\313?\270\004\035\255\270\'\317?\345\023\206%f\347\344?+\023\017\362\232\317\351?Z\244\372\343a,\353?\242\305+\276*z\334?Pf\"\037\302t\344?v\254\2459\251\303\324?\376\273A\n\340\234\353?\270\224?\254F\311\352?:\333\265\312\357\334\347?\270\023\234\343%\342\313?\245\004\373c\215l\357?\230\004\016I]\342\303?0\337\024\372\263#\330?N0\202>r\245\327?Z\2361\252\217(\322?\354\262c#\365\231\316?\322\033U\263\020`\337?\260\376?\273\337V\356?\260\330sd 
l\335?1\363\333\027\372\365\355?\234Wm\021\333\364\336?\215\222\010\032\024#\343?\363TK\342td\342?\353TZQ\373<\354?\346\230\014\213D\317\341?,\251\213\357\253\340\353?Xz0\225\266\224\354?\237\225-q&\236\346?\213I\367>MY\355?t\341\236\262\240\232\333?\020\004\3626!\235\306?\006Fx\264\355\360\344?\220;\307_7\274\271?\276\257\201E0x\327?\022\254\024\034\371\260\336?\256\302\310M\253\010\352?$D\251\235\317\211\300?\354DX\207\314\000\304?\234QM\304\341*\314?[\365\272\t\265C\356?s\222\006r^\304\341?r\263\363C\326\001\354?\334D\314\213E\256\317?<\253C\017+\267\316?\200tT\024\262\231\341?\233\261\257Z\323\231\353?\324\\\014\336\350c\341?\244\323\243j\340,\303?U\353\024.\237{\356?\332\220@\215d\347\342?3\030\n\356Z\244\340?\363\036>\346?\203\252A)\303\010\355?d\242\257\264\203\251\353?\006RNq\317\332\355?|\326N0\374\251\356?&\"\240\303\305\205\335?\356\352\313\202\370+\324?\240fE:\377\317\336?\310!\236T\355\354\311?\346\016\314R\341f\347?\000f\223\301\230\207\325?\370\246@-\273a\301?\375\017\037\"\373b\347?Y\250(T\031\223\345?C\035\335~E\354\356?\364\310\256\023\244t\334?\035\006EnjM\346?\314\313o\024\276q\342?\333\345B\310\262\237\356?\336\"n\013\311\211\330?\222\221\320\323\212\262\351?B\230)9\321i\352?#\223\305\347\002\303\352?\260\210\336\342O \251?\264\366\036\354O]\327?\306\303\364|\014\\\327?L\345\374\376Dh\330?\022\272,\021\300\030\344?\333\207\021\350E\241\357?l\022Q\351\001\263\353?\256\r\224*\305\253\357?\245\361\376\252\2364\351?\\%\375\277jz\355?X\261#9G}\271?\\\337\225\324\361\374\352?\332\351\275\360\272X\356?\372\354X<\314\213\334?0\r\361\233\377\263\256?\263w\237\177\302\246\353?\336\251\336\214:U\346?\234\373:\013b\320\310?j`AH\221?\326?\340S\307v\347\320\234?\220\204\200\033;\014\272?c\272A\262]\014\347?\212S\241\331\244\024\326?:4\350z>M\345?v\234q\206V\007\347?8\306s\036\276\021\344?\234\237\275\244\234\026\355?@\342o\361\301]\233?)t\326\343\211\251\340?\026\377 
`&\326\334?Q\204\214m\327\030\341?\361\303\321\231\233\304\342?\232\001I\005c.\337?\000\230\375`\3251\217?\275\364\344\315=\304\354?\2440\353\346\340\230\336?T%\277 \366\266\337?t\014\353\270\240\017\341?\370\331{N\363\010\262?\310\210\313\212\222=\271?\200\0348\203\230\016\272?jv\276\024\350\215\341?\350:\332\226\004\232\312?^,\177\206\036\376\332?5\247E\376~\365\346?\025\037\341\2045G\346?\202$\331\014{%\351?\370\2775\243-\331\324?\026\204n\306\315S\351?\324\300YZ\250\243\353?4M\355\2641\033\345?\200\370a\362=\244\276?\334J\233\221\021\'\305?\t\272;\204\004\312\343?\300\3701\263\371\243\331?\336\212\353\370\337\260\321?\312E[\320E\312\341?0\314\356\302\2419\311?\004\365\260\204>\024\317?Z\256t\177K\260\336?\250\030\357\342\352\254\272?\300\220\0322\026w\320?`\no0\r\236\265?\237\313X\0315\204\352?\262\342\250\374,+\341?)\026\233JTS\350?88\326\346I\214\311?\3461C;\310\353\347?\347\003\0338\357V\350?B\262\353\344q\371\343?\246\251}\365\337\277\337?{Z\013\265\336\237\355?K\026\026\010\306;\350?|Z\377\321H\261\356?\302^U\027|\303\327?&\261\214\031?\324\347?\247\365\214\017E\t\341?~\316\202\313o.\356?j~V\220b<\354?L.\025k\372\177\325?\232MdS\246\355\352?\301\360\371;\217\026\346?\300\003\252t\265\341\357?\370\303f\226d\245\320?\262\"#\317\320\251\343?;\001\003b9y\355?\300\242\272\027\262\203\316?Z\000e\217=\357\327?\3775\330\266\241\270\340?\322\n\343\367#+\342?\273\344@\000\240\374\344?\000\210\030\213\327!\253?\273\3337t\0245\344?\232\010\371\320\317\003\332?n\202\375\035\222\365\330?\262\252\276\204\033\240\337?\341p\325\347E\024\344?\214N\202?Q\206\334?\306\227\304r\237\370\350?\330\004\330YeE\265?\361\264i;i9\353?\006\253\t\232\302\200\356?\207N{k\241?\340?\336\226E\377\215c\342?\205\034\355?\266a\340?h,\272\240\303\344\352?\300\025)\234w]\274?\235\254\003\013\032J\345?\246\227vR\235\330\344?^-\246\2434\262\333?xi$_\212O\321?Ms\341\200t\036\350?\375\277Y\251\317c\340?\250\256\003\"!\274\354?\261\302yW\367\375\357?\350\035\207\234X\231\352?\340\347\206\202\212\344\3
54?\tb(r\302>\340?\335\331+D^\325\347?\300\215|\021\t\006\315?x\334\374(|:\325?\034\221+\250:\005\355?R\351\030pdO\335?\032\266\314\350\360a\352?\360o\335\351\016s\245?\323\221\372W\035\242\357?Rh\316\276\216X\345?}X\372\271$\327\350?\231\310\n\022\330\346\357?e>\241\035l\243\356?Xx\260\026`b\266??\205\270\016vq\341?:\331j\2622\214\327?\252\024\033\035\371%\327?\240o+\001\266\247\233?2\200\355\004\325*\352?\273\212\177Q0\356\350?\215\234W\3479\347\345?\272\005\321\",O\336?4\013\315}\017\323\323?\375\264\205\375\215\346\344?\373B\317\324\311,\352?\2309\211Q`?\342?\346\223d\206\350\210\331?q8+*X\300\353?\213S\363\274g\211\347?\262O1\"\026\324\357?\020L?\\s\214\337?$\276\227\340_\001\341?\33087\212\216\240\265?e\264\361^\335\372\354?d\346\270\233\010[\334?\247\222\246(\267\007\345?L\375\210l\001*\306?\234\241\363ld_\324?\030\311\036\363\332\002\317?4\367=\002>\000\331?\214?\002ts\234\336?\256R\330\211>>\352?\236\375j\200\346W\346?`I\032\'\'\346\352?\367ed\372\3404\352?(h\202\231\342N\336?\360\250\303zk\200\354?\252\027\016\277\354\337\345?x\371\256\227\340\333\276?\273v}\226\241\325\352?\302\202\303Y\304\241\353?+\335U+\312J\353?\345\330d\324\204\353\340?\2666c\211\215\016\345?Ot\231\253\345\300\351?>\341\353~\370\037\336?\315M\331\374\355,\347?]G\350\205\244\373\354?\362/\361\223\367\340\352?`\"\203\001\202\354\242?\243e\nS\346R\340?\201G\025I&\346\341?\224_Z\300\232\014\355?\245T\016\331\355\014\352?m\244*\3049\374\352?\262V\341\270\350\276\330?\307\315\373\027\364\333\344?3v\321\353\215b\350?\214\315`#\224b\326?\304+zw\032\372\354?\257\036_B\256q\351?\201]\3102\026\272\353?H_Z%\343\277\261?v[\224\326rp\335?@X\300X\254\320\302?\007\337\207\251m\033\341?\022%^A|S\327?\t2\364@ \014\344?\212>\313 
/o\323?\234sK)\352\200\320?r\201\234\320FE\330?(V\327\014\224g\276?\334+\370\377\3476\354?\330\350zi\235+\354?@fB\267\222\000\255?\0348\313\"\335\272\353?\202\325\224*\021\251\355?nq\314\201\237\231\323?\364\371\330\301F\341\350?+\205\022\220\"&\343?\2501\207\264\306\355\320?\026}\305\225\220\361\332?\340\370Q\274\371\356\277?\034\t\370\206\032|\340?\336\335\337\3440\345\323?\031\033ek7c\351?\200\260\237\003\314\322\247?>\262\315EJN\357?\334k\001\255\335\"\312?\320\350{{@\267\327?\260\201\327v\253s\322?\204^Z]C\222\305?vx6\320\004\005\322?\000\322\202\231W=\221?\232\306\370\221#\242\353?\226v\304u\006\027\340?D\017$R\014\275\307?P\370v\017\244-\317?\362\005\275\276\353m\344?\335\337\330\253\262\233\342?\032\361\261\211\322\210\324?\343\252\214t\',\357?b\320y\206\352~\324?d3~\276\316\334\342?8T\210a_\032\277?\330\336\230\263U7\274?r\016\303:\005\225\347?8#)+[\211\350?\356\235\244\223*H\356?\357E\333\251\2329\354?\372zQw\251Z\355?\221:\366^\212\335\343?\211V\327J\233?\341?\3029\323\032\252\004\336?\000\314\244\212\273\322\324?\320\340\235\335\235l\254?H\200\210\210T\017\317?p=\233d@6\250?0d\224\r\303\010\344?hr\350)\016=\347?\362a\026\215\317T\351?\340\317\211 \223\323\310?dM\317\351\"\273\304?\260b\351|m\177\330?z\022\212\360VQ\350?\307\025\203b.\016\352?\023\\\370n\334\245\341?\340\026\201\270\246\016\324?]\2223\233\323\347\346?f~\366\r\223^\324?\254\231xm\017\363\331?_^\217\253\276\235\351?`\220\3758\300\277\220?\nU\325\\\204\335\320?\344F\005\265\364%\316?\3200r(\021\267\260?\200\266\321\366\347\036\340? 
\352\020F\334\232\260?\330z\023\234\371j\273?\322M\021\242\266\006\334?e\322;;6\210\353?\220Q\211\372t\013\324?>\375\274\373\3074\342?~*\262\003g\310\347?.yx\364>\241\341?\220\311h\214\300\337\265?\306`\177S\261\206\346?\320\304g\242\325\036\324?\024\244\030j\252\266\343?o\265\277\323v\317\347?\037\036J\250h\270\345?\240\025\307\357ay\312?h4T&`u\314?^\325\336yL\335\357?\351H\033\271\310\014\353?\363\004|9h\004\340?\323\201l\211\177\223\340?\342\344\001@\3221\327?\374\221\244\250e\007\351?U\326\236J\014H\353?8\214\371FF\356\311?\\[k\274q\260\330?\250\001Vr\356\367\327?\364\\=\351\362V\307?5s\000\356\212\246\357?\000\006@)e|o?\004{\353+o\013\344? \340)$i\275\261?Lq\326\342\340?x\300Y@\016\220\343?\0204B\"\331\332\335?\032J\235\357X\227\323?$w\007\242\225\351\347?\230y\302\3748|\307?\237k\001\346\274\037\357?\270\032@\325\306;\302?\232\365\246\257@0\355?&\373B\265\313\221\350?>\\&\351\177`\345? q2\265\337\023\326?\356\323\267\211)\035\337?\200\306\232\035\360\331\221?h\370\263~\205\305\316?tTQ\037\371s\311?8\265!7\330\340\273?\334\344*\022P\310\332? \266\336}_;\302?\246\360(\037U\346\357?x3\203~D\232\332?[.\010\235\3647\347?\251L\236\216*U\357?,\257\366\2412\240\310?\374ISmn\306\351?\273Z\201\347/C\340?\000\273\330\362\216\317\333?\220\374\335\374\244\027\254?e\206\022\226i\252\342?K\215\240\271\260P\341?tKb\251+\t\304?\317\302\250F\332f\353?\010\001\334\250\207\244\267?\210\253\314\222r\243\317?\334[\'\002\234\374\345?$\346\266\014\314\003\306?\010\3663\004\026\201\332?~\242G\276E\007\322?x&\221`\276\360\347?\366<\242\352x\247\335?}X\335n\216\347\345?9\025\027)j\273\355?Ck\277\307\277\343\340?\314p\253\007K\340\323?\245\tP\203FG\341?\010\250Pp\020\256\331?\350\035\3520\363Y\342?0\261\021\311#\265\264?\241\210+\266\025\213\357?N!k\274\033\030\341?&:\227\202\330:\343?\237B1aPA\355?\302Q\305\305\014\232\341?\214\314\337@\363\304\324?\032\307`\243ft\342?\274tE;\321\344\352?\355X\261d\0145\356?\231JE\337\010\336\343? 
\357\014\375\332\027\326?\201A7\033;P\341?\374\245\336\36571\323?\331\363\267\317\323F\344?\320FK\177\304X\344?C\357\3264[\245\357?\340\275\346?\344\372\226E\025C\337?\355\277\0019vc\344?\200_\020r\003\037\265?\251t\323\371\256:\343?\034\226\233 \026J\341?\335\020\375d\266\006\345?\240\262\034\321[9\236?@%\315<\030\246\333?\255\307p\334\205J\343?j\317V\306K\251\353?\200<\232T\221\264\222?\240K\3439\273>\224?\260!\203\216Y\377\317?\257\264\026\027V\202\354?&|\241CI\327\347?\022\365L\234\375\332\333?\202\365\337\216N@\345?B\352\000S\342k\327?Bn\022\305\267\274\343?@\t\253K>\017\325?\033\214\023\253\251\226\350?\220)]h\204u\345?\025DzW\342\217\347?$\032\351\331\245t\353?0aoo8\\\277?\274\257\256\210\320_\324? \370\257\257\341\034\255?\024\001h\324c\037\353?\230\346D\006_\304\355?~\032,\035a\"\342?2\355VU\225A\327?\024\306\253#\346\207\345?\344\020\301T\321\230\337?\256\264\264J\327\000\323?\034\304\227\337[\003\304?O<\362\r%E\343?+\035\206Xe\242\340?\320\271\306\270\277X\271?\036\017 \276T\376\340?\200Tm\276\210p\351?`1:\311!$\343?p\\\n\355\007\360\313?\300f\375/Lc\345?\000\037(L\016\020\240?\312;]$\006\311\355?h*\031\275\271p\275?\002H\036\306C\016\353?\002\223\370\316\233\004\341?\221\377(*\202,\340?\375\211\344\0079$\357?\277\233\320N\033\034\345?.\315I\033T\264\356?p?\315\257\212\372\273?\342\3557\245s\307\340?\316#m8\251\344\327?y\226\273n\035\321\354?XPn\333\336\004\324?\200\037\336\267h%\277?B\253y\336\267\273\347?b\226\237\344j%\321?&\324]8tn\323?\354L\246\252yG\344?\203S\'\342\204v\357?=\227h\360>\320\344?\364-\225\271\254V\344?\334\026\005:\254(\346?\350\301F5\335?\342?&_F\257\216\247\353?\220\354\016.O^\323?$\267\'\354{\214\324?\224\220\225p\036\004\355?(\302~\212\n3\304?xP\225\213\204A\271?F%o*d\277\344?\362(\327z?\207\345?\210\024\"\3528\254\271?\304~\037C%O\340?_\373\3159\314P\346?f\310W+i\017\357?N\356\317\233\247\206\341?Qj7v\026\021\345?(c\301\036z\300\270?\212H?\220s\321\332?\345\335E+\325&\350?L\206\340:\236\177\322?\3248\356\202\347\202\354?R@EN\210h
\336?8\236\030\032\232\r\277?\210\332\244H\034\334\302?x+\250\373\021\301\313?\260]Y\250u\330\355?\364f\177\244\232\275\312?HR\002\374\215K\307?\367&?ja\354\352?\\\221\320\371\356\217\335?\352\n\210Q\201@\322?\324\267\372\007v\251\357?\304iV\317\032$\331?V\365\304\033\rK\323?\241\017\023\023o\305\350?\254F\374\302\325h\343?\205AI\210\263\311\340?\203\254W\244\026\241\346?\342\213\357\376cb\353?\244\232L\004I\005\307?\216\322g\350\363\243\330?O\010\244X\241\347\343?S\366\033\t\347\361\353?\027\224\335\244.?\340?\210\252\333A\203L\325?\226\316t\020)\226\341?\300\307\027\261`\n\321?l]%\352\3075\316?Bm\274\\\215^\323?\215\232\203\255\216\344\356?\324Q\254\235\313a\336?\272\217\334\013\347\313\354?\000=\316/\262\377\222?h\215X(<3\357?\252\301\004V\357\355\354?@\245\323L\006/\332?\310*\307E\354\031\275?\004\'\203\241\373\355\305?\232\306gy\346\251\350?\312\276O\226s\260\332?:\276\225\000\251\235\355?|\301\354X\323K\352?D\017\274px\231\346?@\tEs\026f\262?\270\007y\233\204\324\331?Og\236\346b=\344?>\323\201\320b>\352?>\376J\237\020}\341?\246\362\2458\016\020\333?\364s^\"\337\217\343?\010\364\300M\351r\323?\211t\310\3267#\351?\024B,\300\035\375\357? 
_\233\014&-\301?R\373\\\023X\321\357?\210\221\322j\222,\337?ux\220\245;\\\346?\244\353\273\3038\224\315?\343\013A\037I\263\346?m\013\236\302!\224\351?h\371\\\022\224\276\302?\030\327\2062E.\277?`\0045xg\205\311?\020Y\220g.f\251?\200]GP))\247?h\274\362q\323F\325?\020\254L5/\316\262?\035F\275{\340\274\354?\300\323\341\362\377\236\255?\330x\327\206\331\340\327?\310\327pd\345{\342?\010u\272\361m7\326?\254\273\247\245\245{\300?\300\311?@\016\247\254?h\3355\215\037\274\321?\265\307\003\277%W\343?xR\360c+\274\271?\000\212X\026\361o\222?\\\203\277,\303I\313?\340\325\t\022\361t\221?\300\027\312\330\337\020\256?\261)\356l\036\211\347?\260\021\224\216>\024\323?(\n\337k\317p\327?\304:\361jF\037\326?\260d\247l\034\375\342?y\340S\275\0069\357?f\362\323\235dP\346?\350?\341\347J\301\307?d\332(\211\033\026\357?\234I\363\341\372\220\342?~\333z\371N\334\335?\204\316^9\276\210\330?\307\0241\244#H\341?\225\343\215\007\tq\340?\346\340l\000\035n\344?t\337\366\250\2371\353?8\301\331\025\345\325\333?\306&k\266\027o\340?\251\037\345\010\242\301\356?\036\350\006Rn\361\323?\217\341\274\217\311\\\356?f\3235\376f\240\333?\225\271\t\241k\277\344?d:\271\260\324\251\301?\034\312+\203\223\357\326?n\275\004\321A\303\353?\324\305Z\241\010\242\342?\222\363\350}I\312\347?f\324\247gBm\322?\035nn\027\005\257\346?\260\255G\216\272\003\240?\224W\374\033\260\016\345?\340\024hZ>S\311?z:`S\204g\346?\354%Rh\203H\353?\"\033\365\252\213\272\335?xZ]\n\037\220\276?\270\363\360\034\331\023\347?\250\377\\\321\312\301\347?@\263w\267\311\207\200?@@\317\242\022\201\314?\316\346\263\262\213\224\320?\244\335}\255\241\255\357?\317\'#\376q\003\353?v\235\3069\377\257\333?2\265vQ\336\240\326?e\352\350\221\277Y\344?\322\330FL\260\350\346?\263\270>A\245g\351?\r/\\\000j\264\352?\204\035l\265\261e\330?\272\275\244\205\024\345\330?\200\220\261\017\323\233\244?\202SI>\310 
\324?8\3134\037\266\353\306?\260\323\002\353\241\274\261?=\244VG(|\352?Y\345V\317\271a\351?,\004\265\316\304N\351?l\177R\323\nv\317?\230\274\034z\230\304\323?\360\260\255\222\355\236\341?" + } + } + } + } + node { + name: "PartitionedCall" + op: "PartitionedCall" + input: "Const" + attr { + key: "Tin" + value { + list { + type: DT_DOUBLE + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_DOUBLE + } + } + } + attr { + key: "_collective_manager_ids" + value { + list { + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + attr { + key: "_read_only_resource_inputs" + value { + list { + } + } + } + attr { + key: "config_proto" + value { + s: "\202\001\0008\0012\002J\000\n\007\n\003CPU\020\001\n\007\n\003GPU\020\000" + } + } + attr { + key: "f" + value { + func { + name: "__inference_signature_wrapper_13" + } + } + } + } + node { + name: "NoOp" + op: "NoOp" + } + node { + name: "Const_1" + op: "Const" + device: "/device:CPU:0" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "\n\035\n\t\022\005get_c\010\001\n\016\022\nsignatures\010\002*\000\n\017\n\013\022\007trace_0\010\003*\000\n\027\n\023\022\017serving_default\010\004*\000\n\021\n\r\022\tcapture_0\010\005*\000\n\021\n\r\022\tcapture_0\010\005*\000\n\002*\000" + } + } + } + } + node { + name: "saver_filename" + op: "Placeholder" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "shape" + value { + shape { + } + } + } + } + node { + name: "StatefulPartitionedCall" + op: "StatefulPartitionedCall" + input: "saver_filename" + input: "Const_1" + attr { + key: "Tin" + value { + list { + type: DT_STRING + type: DT_STRING + } + } + } + attr 
{ + key: "Tout" + value { + list { + type: DT_STRING + } + } + } + attr { + key: "_collective_manager_ids" + value { + list { + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "_read_only_resource_inputs" + value { + list { + } + } + } + attr { + key: "config_proto" + value { + s: "\202\001\0008\0012\002J\000\n\007\n\003CPU\020\001\n\007\n\003GPU\020\000" + } + } + attr { + key: "f" + value { + func { + name: "__inference__traced_save_40" + } + } + } + } + node { + name: "StatefulPartitionedCall_1" + op: "StatefulPartitionedCall" + input: "saver_filename" + attr { + key: "Tin" + value { + list { + type: DT_STRING + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_STRING + } + } + } + attr { + key: "_collective_manager_ids" + value { + list { + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "_read_only_resource_inputs" + value { + list { + } + } + } + attr { + key: "config_proto" + value { + s: "\202\001\0008\0012\002J\000\n\007\n\003CPU\020\001\n\007\n\003GPU\020\000" + } + } + attr { + key: "f" + value { + func { + name: "__inference__traced_restore_49" + } + } + } + } + library { + function { + signature { + name: "__inference_signature_wrapper_13" + input_arg { + name: "unknown" + type: DT_DOUBLE + } + output_arg { + name: "identity" + type: DT_DOUBLE + } + } + node_def { + name: "PartitionedCall" + op: "PartitionedCall" + input: "unknown" + attr { + key: "Tin" + value { + list { + type: DT_DOUBLE + } + } + } + attr { + key: "Tout" + value { + list { + type: DT_DOUBLE + } + } + } + attr { + key: "_collective_manager_ids" + value { + list { + } + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + attr { + key: "_read_only_resource_inputs" + value { + list { + } + } + } + attr { + key: "config_proto" + value { + s: 
"\202\001\0008\0012\002J\000\n\007\n\003CPU\020\001\n\007\n\003GPU\020\000" + } + } + attr { + key: "f" + value { + func { + name: "__inference__6" + } + } + } + } + node_def { + name: "Identity" + op: "Identity" + input: "PartitionedCall:output:0" + attr { + key: "T" + value { + type: DT_DOUBLE + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + } + ret { + key: "identity" + value: "Identity:output:0" + } + attr { + key: "_construction_context" + value { + s: "kEagerRuntime" + } + } + attr { + key: "_input_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + attr { + key: "_user_specified_name" + value { + s: "9" + } + } + } + } + } + function { + signature { + name: "__inference__traced_save_40" + input_arg { + name: "file_prefix" + type: DT_STRING + } + input_arg { + name: "savev2_const_1" + type: DT_STRING + } + output_arg { + name: "identity_1" + type: DT_STRING + } + is_stateful: true + control_output: "MergeV2Checkpoints" + } + node_def { + name: "StaticRegexFullMatch" + op: "StaticRegexFullMatch" + input: "file_prefix" + device: "/device:CPU:*" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "pattern" + value { + s: "^s3://.*" + } + } + } + node_def { + name: "Const" + op: "Const" + device: "/device:CPU:*" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: ".part" + } + } + } + } + node_def { + name: "Const_1" + op: "Const" + device: "/device:CPU:*" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: 
"dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + } + string_val: "_temp/part" + } + } + } + } + node_def { + name: "Select" + op: "Select" + input: "StaticRegexFullMatch:output:0" + input: "Const:output:0" + input: "Const_1:output:0" + device: "/device:CPU:*" + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + node_def { + name: "StringJoin" + op: "StringJoin" + input: "file_prefix" + input: "Select:output:0" + device: "/device:CPU:*" + attr { + key: "N" + value { + i: 2 + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + node_def { + name: "num_shards" + op: "Const" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 1 + } + } + } + } + node_def { + name: "ShardedFilename/shard" + op: "Const" + device: "/device:CPU:0" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "dtype" + value { + type: DT_INT32 + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_INT32 + tensor_shape { + } + int_val: 0 + } + } + } + } + node_def { + name: "ShardedFilename" + op: "ShardedFilename" + input: "StringJoin:output:0" + input: "ShardedFilename/shard:output:0" + input: "num_shards:output:0" + device: "/device:CPU:0" + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + node_def { + name: "SaveV2/tensor_names" + op: "Const" + device: "/device:CPU:0" + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + dim { + size: 1 + } + } + 
string_val: "_CHECKPOINTABLE_OBJECT_GRAPH" + } + } + } + } + node_def { + name: "SaveV2/shape_and_slices" + op: "Const" + device: "/device:CPU:0" + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + dim { + size: 1 + } + } + string_val: "" + } + } + } + } + node_def { + name: "SaveV2" + op: "SaveV2" + input: "ShardedFilename:filename:0" + input: "SaveV2/tensor_names:output:0" + input: "SaveV2/shape_and_slices:output:0" + input: "savev2_const_1" + device: "/device:CPU:0" + attr { + key: "_has_manual_control_dependencies" + value { + b: true + } + } + attr { + key: "_output_shapes" + value { + list { + } + } + } + attr { + key: "dtypes" + value { + list { + type: DT_STRING + } + } + } + } + node_def { + name: "MergeV2Checkpoints/checkpoint_prefixes" + op: "Pack" + input: "ShardedFilename:filename:0" + input: "^SaveV2" + device: "/device:CPU:0" + attr { + key: "N" + value { + i: 1 + } + } + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + } + node_def { + name: "MergeV2Checkpoints" + op: "MergeV2Checkpoints" + input: "MergeV2Checkpoints/checkpoint_prefixes:output:0" + input: "file_prefix" + device: "/device:CPU:0" + attr { + key: "_has_manual_control_dependencies" + value { + b: true + } + } + attr { + key: "_output_shapes" + value { + list { + } + } + } + } + node_def { + name: "Identity" + op: "Identity" + input: "file_prefix" + input: "^MergeV2Checkpoints" + device: "/device:CPU:0" + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + node_def { + name: "Identity_1" + op: "Identity" + input: "Identity:output:0" + input: "^NoOp" + attr { + key: "T" + value { + type: DT_STRING + } + } + attr 
{ + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + node_def { + name: "NoOp" + op: "NoOp" + input: "^MergeV2Checkpoints" + attr { + key: "_output_shapes" + value { + list { + } + } + } + } + ret { + key: "identity_1" + value: "Identity_1:output:0" + } + attr { + key: "_construction_context" + value { + s: "kEagerRuntime" + } + } + attr { + key: "_input_shapes" + value { + list { + shape { + } + shape { + } + } + } + } + control_ret { + key: "MergeV2Checkpoints" + value: "MergeV2Checkpoints" + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "_user_specified_name" + value { + s: "file_prefix" + } + } + } + } + arg_attr { + key: 1 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "_user_specified_name" + value { + s: "Const_1" + } + } + } + } + } + function { + signature { + name: "__inference__18" + input_arg { + name: "unknown" + type: DT_DOUBLE + } + output_arg { + name: "identity" + type: DT_DOUBLE + } + } + node_def { + name: "Identity" + op: "Identity" + input: "unknown" + attr { + key: "T" + value { + type: DT_DOUBLE + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + } + ret { + key: "identity" + value: "Identity:output:0" + } + attr { + key: "_construction_context" + value { + s: "kEagerRuntime" + } + } + attr { + key: "_input_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + attr { + key: "_user_specified_name" + value { + s: "15" + } + } + } + } + } + function { + signature { + name: "__inference__traced_restore_49" + input_arg { + name: "file_prefix" + type: DT_STRING + } + output_arg { + name: "identity_1" + 
type: DT_STRING + } + is_stateful: true + } + node_def { + name: "RestoreV2/tensor_names" + op: "Const" + device: "/device:CPU:0" + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + dim { + size: 1 + } + } + string_val: "_CHECKPOINTABLE_OBJECT_GRAPH" + } + } + } + } + node_def { + name: "RestoreV2/shape_and_slices" + op: "Const" + device: "/device:CPU:0" + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 1 + } + } + } + } + } + attr { + key: "dtype" + value { + type: DT_STRING + } + } + attr { + key: "value" + value { + tensor { + dtype: DT_STRING + tensor_shape { + dim { + size: 1 + } + } + string_val: "" + } + } + } + } + node_def { + name: "RestoreV2" + op: "RestoreV2" + input: "file_prefix" + input: "RestoreV2/tensor_names:output:0" + input: "RestoreV2/shape_and_slices:output:0" + device: "/device:CPU:0" + attr { + key: "_output_shapes" + value { + list { + shape { + unknown_rank: true + } + } + } + } + attr { + key: "dtypes" + value { + list { + type: DT_STRING + } + } + } + } + node_def { + name: "NoOp" + op: "NoOp" + device: "/device:CPU:0" + attr { + key: "_has_manual_control_dependencies" + value { + b: true + } + } + attr { + key: "_output_shapes" + value { + list { + } + } + } + } + node_def { + name: "Identity" + op: "Identity" + input: "file_prefix" + input: "^NoOp" + device: "/device:CPU:0" + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + node_def { + name: "Identity_1" + op: "Identity" + input: "Identity:output:0" + attr { + key: "T" + value { + type: DT_STRING + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + } + ret { + key: "identity_1" + value: "Identity_1:output:0" + } + attr { + key: 
"_construction_context" + value { + s: "kEagerRuntime" + } + } + attr { + key: "_input_shapes" + value { + list { + shape { + } + } + } + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + } + } + } + } + attr { + key: "_user_specified_name" + value { + s: "file_prefix" + } + } + } + } + } + function { + signature { + name: "__inference__6" + input_arg { + name: "unknown" + type: DT_DOUBLE + } + output_arg { + name: "identity" + type: DT_DOUBLE + } + } + node_def { + name: "Identity" + op: "Identity" + input: "unknown" + attr { + key: "T" + value { + type: DT_DOUBLE + } + } + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + } + ret { + key: "identity" + value: "Identity:output:0" + } + attr { + key: "_construction_context" + value { + s: "kEagerRuntime" + } + } + attr { + key: "_input_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + arg_attr { + key: 0 + value { + attr { + key: "_output_shapes" + value { + list { + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + } + attr { + key: "_user_specified_name" + value { + s: "3" + } + } + } + } + } + } + versions { + producer: 1520 + min_consumer: 12 + } + } + saver_def { + filename_tensor_name: "saver_filename:0" + save_tensor_name: "StatefulPartitionedCall:0" + restore_op_name: "StatefulPartitionedCall_1" + version: V2 + } + collection_def { + key: "saved_model_main_op" + value { + node_list { + value: "NoOp" + } + } + } + signature_def { + key: "__saved_model_init_op" + value { + outputs { + key: "__saved_model_init_op" + value { + name: "NoOp" + tensor_shape { + unknown_rank: true + } + } + } + } + } + signature_def { + key: "serving_default" + value { + outputs { + key: "output_0" + value { + name: "PartitionedCall:0" + dtype: DT_DOUBLE + tensor_shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + } + } + 
method_name: "tensorflow/serving/predict" + } + } + object_graph_def { + nodes { + children { + node_id: 1 + local_name: "get_c" + } + children { + node_id: 2 + local_name: "signatures" + } + user_object { + identifier: "_generic_user_object" + version { + producer: 1 + min_consumer: 1 + } + } + } + nodes { + children { + node_id: 3 + local_name: "trace_0" + } + function { + concrete_functions: "__inference__18" + function_spec { + fullargspec { + named_tuple_value { + name: "FullArgSpec" + values { + key: "args" + value { + list_value { + } + } + } + values { + key: "varargs" + value { + none_value { + } + } + } + values { + key: "varkw" + value { + none_value { + } + } + } + values { + key: "defaults" + value { + none_value { + } + } + } + values { + key: "kwonlyargs" + value { + list_value { + } + } + } + values { + key: "kwonlydefaults" + value { + none_value { + } + } + } + values { + key: "annotations" + value { + dict_value { + } + } + } + } + } + input_signature { + tuple_value { + } + } + } + } + dependencies { + node_id: 3 + local_name: "trace_0" + } + } + nodes { + children { + node_id: 4 + local_name: "serving_default" + } + user_object { + identifier: "signature_map" + version { + producer: 1 + min_consumer: 1 + } + } + } + nodes { + children { + node_id: 5 + local_name: "capture_0" + } + bare_concrete_function { + concrete_function_name: "__inference__18" + function_spec { + fullargspec { + named_tuple_value { + name: "FullArgSpec" + values { + key: "args" + value { + list_value { + } + } + } + values { + key: "varargs" + value { + none_value { + } + } + } + values { + key: "varkw" + value { + none_value { + } + } + } + values { + key: "defaults" + value { + none_value { + } + } + } + values { + key: "kwonlyargs" + value { + list_value { + } + } + } + values { + key: "kwonlydefaults" + value { + none_value { + } + } + } + values { + key: "annotations" + value { + dict_value { + } + } + } + } + } + input_signature { + tuple_value { + } + } + } + } + 
dependencies { + node_id: 5 + local_name: "capture_0" + } + } + nodes { + children { + node_id: 5 + local_name: "capture_0" + } + bare_concrete_function { + concrete_function_name: "__inference_signature_wrapper_13" + function_spec { + fullargspec { + named_tuple_value { + name: "FullArgSpec" + values { + key: "args" + value { + list_value { + } + } + } + values { + key: "varargs" + value { + none_value { + } + } + } + values { + key: "varkw" + value { + none_value { + } + } + } + values { + key: "defaults" + value { + none_value { + } + } + } + values { + key: "kwonlyargs" + value { + list_value { + } + } + } + values { + key: "kwonlydefaults" + value { + none_value { + } + } + } + values { + key: "annotations" + value { + dict_value { + } + } + } + } + } + input_signature { + tuple_value { + } + } + } + } + dependencies { + node_id: 5 + local_name: "capture_0" + } + } + nodes { + constant { + operation: "Const" + } + registered_name: "tf.TrackableConstant" + } + concrete_functions { + key: "__inference__18" + value { + bound_inputs: 5 + canonicalized_input_signature { + tuple_value { + values { + tuple_value { + } + } + values { + dict_value { + } + } + } + } + output_signature { + tensor_spec_value { + name: "unknown" + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + dtype: DT_DOUBLE + } + } + } + } + concrete_functions { + key: "__inference_signature_wrapper_13" + value { + bound_inputs: 5 + canonicalized_input_signature { + tuple_value { + values { + tuple_value { + } + } + values { + dict_value { + } + } + } + } + output_signature { + dict_value { + fields { + key: "output_0" + value { + tensor_spec_value { + name: "output_0" + shape { + dim { + size: 150 + } + dim { + size: 150 + } + } + dtype: DT_DOUBLE + } + } + } + } + } + } + } + } +} diff --git a/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/fingerprint.pb b/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/fingerprint.pb new file mode 
100644 index 00000000000..a033e7c08e9 --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/fingerprint.pb @@ -0,0 +1 @@ +2(Åžçì½…âÀ Œ‚¦þ¡žõ󼎢¶â­ÚâŽßÅ«ƒÏ¾œîÓÀ°³éîâ®Ù \ No newline at end of file diff --git a/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/saved_model.pb b/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/saved_model.pb new file mode 100644 index 00000000000..46fc1c51987 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/saved_model.pb differ diff --git a/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/variables/variables.data-00000-of-00001 b/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/variables/variables.data-00000-of-00001 new file mode 100644 index 00000000000..3e08df4e8f9 Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/variables/variables.data-00000-of-00001 differ diff --git a/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/variables/variables.index b/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/variables/variables.index new file mode 100644 index 00000000000..2b377c2506a Binary files /dev/null and b/tensorflow/cc/saved_model/testdata/chunked_saved_model/non_chunked_model/variables/variables.index differ diff --git a/tensorflow/cc/saved_model/testdata/generate_chunked_models.py b/tensorflow/cc/saved_model/testdata/generate_chunked_models.py new file mode 100644 index 00000000000..ef9f968a68b --- /dev/null +++ b/tensorflow/cc/saved_model/testdata/generate_chunked_models.py @@ -0,0 +1,76 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Generates GraphDef test data for Merger. + +Constructs chunked protos test data containing GraphDefs with lots of nodes and +large nodes for Merger::Read and Merger::Merge. +""" + +from collections.abc import Sequence + +import os + +from absl import app +from absl import flags +import numpy as np + +from tensorflow.python.compat import v2_compat +from tensorflow.python.eager import def_function +from tensorflow.python.framework import constant_op +from tensorflow.python.lib.io import file_io +from tensorflow.python.module import module +from tensorflow.python.saved_model import loader_impl +from tensorflow.python.saved_model import save +from tensorflow.python.saved_model import save_options +from tensorflow.python.util import compat +from tensorflow.tools.proto_splitter import constants +from tensorflow.tools.proto_splitter.python import saved_model as proto_splitter + +SPLITTER_TESTDATA_PATH = flags.DEFINE_string( + "path", None, help="Path to testdata directory.") + + +def generate_non_chunked_model(non_chunked_dir: str): + root = module.Module() + root.c = constant_op.constant(np.random.random_sample([150, 150])) + constants.debug_set_max_size(80000) + root.get_c = def_function.function(lambda: root.c) + signatures = root.get_c.get_concrete_function() + save.save(root, non_chunked_dir, signatures=signatures, + options=save_options.SaveOptions(experimental_image_format=False)) + + +def generate_chunked_model(non_chunked_dir: str, chunked_dir: str): + saved_model = 
loader_impl.parse_saved_model(non_chunked_dir) + prefix = file_io.join(compat.as_str(chunked_dir), "saved_model") + file_io.write_string_to_file(f"{prefix}.pbtxt", str(saved_model)) + proto_splitter.SavedModelSplitter(saved_model).write(prefix) + + +def main(argv: Sequence[str]) -> None: + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + + main_dir = os.path.join(SPLITTER_TESTDATA_PATH.value, "chunked_saved_model") + non_chunked_dir = os.path.join(main_dir, "non_chunked_model") + generate_non_chunked_model(non_chunked_dir) + chunked_dir = os.path.join(main_dir, "chunked_model") + generate_chunked_model(non_chunked_dir, chunked_dir) + + +if __name__ == "__main__": + v2_compat.enable_v2_behavior() + app.run(main) diff --git a/tensorflow/cc/tools/BUILD b/tensorflow/cc/tools/BUILD index 510e7f589fd..bb5daa99742 100644 --- a/tensorflow/cc/tools/BUILD +++ b/tensorflow/cc/tools/BUILD @@ -1,4 +1,5 @@ -# Description: +#include "third_party/absl/strings/str_cat.h" +#Description: # TensorFlow cc tools. load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") @@ -22,6 +23,8 @@ cc_library( "//tensorflow/core:core_cpu", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/cc/tools/freeze_saved_model.cc b/tensorflow/cc/tools/freeze_saved_model.cc index 480c048e94f..5dcf5e64964 100644 --- a/tensorflow/cc/tools/freeze_saved_model.cc +++ b/tensorflow/cc/tools/freeze_saved_model.cc @@ -18,6 +18,8 @@ limitations under the License. 
#include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/function.pb.h" #include "tensorflow/core/framework/graph.pb.h" @@ -193,7 +195,7 @@ StatusOr GetVarHandleName( if (node->op() == "VarHandleOp") { return node->name(); } - return errors::NotFound("No VarHandleOp ancestor found"); + return absl::NotFoundError("No VarHandleOp ancestor found"); } // Looks up the variable handle that provides input to node with node_name, @@ -209,7 +211,7 @@ StatusOr GetHandleNameIfNeedsToFreeze( if (var_handle_name.ok() && variable_node_names.count(*var_handle_name)) { return var_handle_name; } - return errors::NotFound("No VarHandleOp ancestor found"); + return absl::NotFoundError("No VarHandleOp ancestor found"); } // Freezes the subgraph of all nodes needed by `outputs`. diff --git a/tensorflow/compiler/aot/codegen.cc b/tensorflow/compiler/aot/codegen.cc index 7a8d5273b03..d8d2ea82e76 100644 --- a/tensorflow/compiler/aot/codegen.cc +++ b/tensorflow/compiler/aot/codegen.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/aot/codegen.h" +#include #include #include #include @@ -24,6 +25,7 @@ limitations under the License. #include "absl/strings/str_join.h" #include "absl/strings/str_replace.h" #include "absl/strings/str_split.h" +#include "absl/strings/substitute.h" #include "absl/types/span.h" #include "tensorflow/compiler/aot/embedded_protocol_buffers.h" #include "tensorflow/compiler/tf2xla/tf2xla.pb.h" @@ -312,6 +314,74 @@ Status GenVariableMethods(const tf2xla::Config& config, return OkStatus(); } +// Generate shape infos for args (inputs). +Status GenArgShapeInfos(const xla::ProgramShapeProto& ps, string* infos) { + for (int i = 0; i < ps.parameters_size(); ++i) { + const xla::ShapeProto& shape = ps.parameters(i); + if (shape.element_type() == xla::TUPLE) { + // ShapeInfo cannot represent tuple args. 
+ return absl::InternalError( + absl::StrCat("parameter ", i, + ": codegen requires XLA parameters to " + "be non-tuples.")); + } + // Please some compilers (e.g. MSVC) by avoiding the initialization of an + // array of unknown size an empty initializer. Use "-1" for this; note that + // this value is never used (the size attribute is set to 0 in ShapeInfo). + *infos += absl::Substitute(R"( static constexpr int32_t kArg$0Shapes[] = { +$1 + }; +)", + i, + shape.dimensions_size() > 0 + ? absl::StrJoin(shape.dimensions(), ", ") + : "-1"); + } + *infos += R"( static const ShapeInfo* ArgShapeInfos() { + static constexpr ShapeInfo kArgShapeInfoTable[kNumArgs] = { +)"; + for (int i = 0; i < ps.parameters_size(); ++i) { + const xla::ShapeProto& shape = ps.parameters(i); + *infos += + absl::Substitute("{ kArg$0Shapes, $1 },\n", i, shape.dimensions_size()); + } + *infos += R"( }; + return kArgShapeInfoTable; + })"; + return OkStatus(); +} + +// Generate shape infos for results. +Status GenResultShapeInfos(const xla::ProgramShapeProto& ps, string* infos) { + if (ps.result().element_type() != xla::TUPLE) { + return absl::InternalError("codegen requires the XLA result to be a tuple"); + } + for (int i = 0; i < ps.result().tuple_shapes_size(); ++i) { + const xla::ShapeProto& shape = ps.result().tuple_shapes(i); + // See above comment about the use here of "-1". + *infos += absl::Substitute( + R"( static constexpr int32_t kResult$0Shapes[] = { +$1 + }; +)", + i, + shape.dimensions_size() > 0 ? 
absl::StrJoin(shape.dimensions(), ", ") + : "-1"); + } + *infos += R"( static const ShapeInfo* ResultShapeInfos() { + static constexpr ShapeInfo kResultShapeInfoTable[kNumResults] = { +)"; + for (int i = 0; i < ps.result().tuple_shapes_size(); ++i) { + const xla::ShapeProto& shape = ps.result().tuple_shapes(i); + *infos += absl::Substitute("{ kResult$0Shapes, $1 },\n", i, + shape.dimensions_size()); + } + *infos += R"( }; + return kResultShapeInfoTable; + })"; + return OkStatus(); +} + // Generates code implementing {Arg,Result}Names(), where T is one of // tf2xla::{Feed,Fetch,Variable}. Each feed or fetch name results in a C-style // string literal in the array, with nullptr terminating the array. @@ -377,17 +447,27 @@ std::vector BufferInfosToCppExpression( std::transform(buffer_infos.begin(), buffer_infos.end(), std::back_inserter(buffer_infos_as_strings), [](const BufferInfo& buffer_info) { - std::pair encoded = buffer_info.Encode(); - string encoded_second_as_str = - encoded.second == ~0ULL - ? "~0ULL" - : absl::StrCat(encoded.second, "ULL"); + xla::cpu_function_runtime::EncodedBufferInfo encoded = + buffer_info.Encode(); + auto param_to_str = [](uint32_t param) -> std::string { + return param == ~0U ? "~0U" : absl::StrCat(param, "U"); + }; return absl::StrCat( - "::xla::cpu_function_runtime::BufferInfo({", - encoded.first, "ULL, ", encoded_second_as_str, "})"); + "::xla::cpu_function_runtime::BufferInfo(", + encoded.packed_kind_and_size, "ULL, ", + param_to_str(encoded.entry_param_number), ", ", + param_to_str(encoded.result_param_number), ")"); }); return buffer_infos_as_strings; } + +Status CheckEqual(size_t a, size_t b, absl::string_view error_msg) { + if (a != b) { + return absl::InternalError( + absl::StrCat(error_msg, ". 
Expected ", a, ", got ", b, ".")); + } + return OkStatus(); +} } // namespace Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, @@ -400,6 +480,8 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, compile_result.aot->buffer_infos(); const std::vector arg_index_table = ::xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); + const std::vector result_index_table = + ::xla::cpu::CreateResultIndexTableFromBufferInfos(buffer_infos); std::vector buffer_infos_as_strings = BufferInfosToCppExpression(buffer_infos); const int64_t buffer_infos_size = buffer_infos.size(); @@ -419,6 +501,15 @@ Status GenerateHeader(const CodegenOpts& opts, const tf2xla::Config& config, TF_RETURN_IF_ERROR(GenArgMethods(config, ps, compile_result, &methods_arg)); TF_RETURN_IF_ERROR(GenResultMethods(config, ps, &methods_result)); TF_RETURN_IF_ERROR(GenVariableMethods(config, ps, &methods_variable)); + string arg_shape_infos, result_shape_infos; + TF_RETURN_IF_ERROR(GenArgShapeInfos(ps, &arg_shape_infos)); + TF_RETURN_IF_ERROR( + CheckEqual(ps.parameters_size(), arg_index_table.size(), + "Arg number mismatch, proto vs. arg_index_table")); + TF_RETURN_IF_ERROR(GenResultShapeInfos(ps, &result_shape_infos)); + TF_RETURN_IF_ERROR( + CheckEqual(ps.result().tuple_shapes_size(), result_index_table.size(), + "Result number mismatch, proto vs. result_index_table")); const size_t arg_bytes_aligned = xla::cpu_function_runtime::AlignedBufferBytes( buffer_infos_for_args.data(), buffer_infos_for_args.size(), @@ -544,6 +635,8 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { // Number of input arguments for the compiled computation. static constexpr size_t kNumArgs = {{ARG_NUM}}; + static constexpr size_t kNumResults = {{RESULT_NUM}}; + // Number of variables for the compiled computation. 
static constexpr size_t kNumVariables = {{VARIABLE_NUM}}; @@ -560,16 +653,21 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { set_static_data_raw_function(data, {{ENTRY}}); set_static_data_buffer_infos(data, BufferInfos()); set_static_data_num_buffers(data, kNumBuffers); + set_static_data_result_index_table(data, ResultIndexToBufferIndex()); + set_static_data_num_results(data, kNumResults); set_static_data_arg_index_table(data, ArgIndexToBufferIndex()); set_static_data_num_args(data, kNumArgs); set_static_data_num_variables(data, kNumVariables); set_static_data_result_index(data, kResultIndex); + set_static_data_arg_shape_infos(data, ArgShapeInfos()); + set_static_data_result_shape_infos(data, ResultShapeInfos()); set_static_data_arg_names(data, StaticArgNames()); set_static_data_variable_names(data, StaticVariableNames()); set_static_data_result_names(data, StaticResultNames()); set_static_data_program_shape(data, StaticProgramShape()); set_static_data_hlo_profile_printer_data( data, StaticHloProfilePrinterData()); + set_static_data_use_xla_runtime(data, {{USE_XLA_RUNTIME}}); {{ASSIGN_PROFILE_COUNTERS_SIZE}} return data; }(); @@ -589,7 +687,7 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { // // void set_argN_data(void* data) // Sets the buffer of type T for positional argument N. May be called in - // any AllocMode. Must be called before Run to have an affect. Must be + // any AllocMode. Must be called before Run to have an effect. Must be // called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional // argument, to set the argument buffers. 
// @@ -655,6 +753,13 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { return kBufferInfos; } + static const ::tensorflow::int32* ResultIndexToBufferIndex() { + static constexpr ::tensorflow::int32 kResultIndexToBufferIndex[kNumResults] = { +{{RESULT_INDEX_TABLE}} + }; + return kResultIndexToBufferIndex; + } + static const ::tensorflow::int32* ArgIndexToBufferIndex() { static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = { {{ARG_INDEX_TABLE}} @@ -665,6 +770,12 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { // The 0-based index of the result tuple in the temporary buffers. static constexpr size_t kResultIndex = {{RESULT_INDEX}}; + // Shapes of the input arguments. +{{ARG_SHAPE_INFOS}}; + + // Shapes of the results. +{{RESULT_SHAPE_INFOS}}; + // Array of names of each positional argument, terminated by nullptr. static const char** StaticArgNames() {{ARG_NAMES_CODE}} @@ -699,13 +810,18 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { {"{{ARG_BYTES_TOTAL}}", absl::StrCat(arg_bytes_total)}, {"{{ARG_NAMES_CODE}}", arg_names_code}, {"{{ARG_NUM}}", absl::StrCat(arg_index_table.size())}, + {"{{ARG_SHAPE_INFOS}}", arg_shape_infos}, {"{{VARIABLE_NUM}}", absl::StrCat(config.variable_size())}, {"{{ARG_INDEX_TABLE}}", absl::StrJoin(arg_index_table, ", ")}, + {"{{RESULT_NUM}}", absl::StrCat(result_index_table.size())}, + {"{{RESULT_INDEX_TABLE}}", absl::StrJoin(result_index_table, ", ")}, + {"{{ASSIGN_PROFILE_COUNTERS_SIZE}}", assign_profile_counters_size}, {"{{CLASS}}", opts.class_name}, {"{{DECLS_FROM_OBJ_FILE}}", absl::StrJoin(metadata_result.header_variable_decls, "\n")}, {"{{ENTRY}}", compile_result.entry_point}, + {"{{USE_XLA_RUNTIME}}", opts.use_xla_runtime ? 
"true" : "false"}, {"{{HLO_PROFILE_PRINTER_DATA_SHIM_EXPRESSION}}", metadata_result.hlo_profile_printer_data_access_shim}, {"{{INCLUDE_XLA_DATA_PROTO}}", include_xla_data_proto}, @@ -722,6 +838,7 @@ class {{CLASS}} final : public tensorflow::XlaCompiledCpuFunction { {"{{VARIABLE_NAMES_CODE}}", variable_names_code}, {"{{RESULT_INDEX}}", absl::StrCat(result_index)}, {"{{RESULT_NAMES_CODE}}", result_names_code}, + {"{{RESULT_SHAPE_INFOS}}", result_shape_infos}, {"{{TEMP_BYTES_ALIGNED}}", absl::StrCat(temp_bytes_aligned)}, {"{{TEMP_BYTES_TOTAL}}", absl::StrCat(temp_bytes_total)}, {"{{NUM_BUFFERS}}", absl::StrCat(buffer_infos.size())}, @@ -749,7 +866,7 @@ Status GenerateMetadata(const CodegenOpts& opts, if (opts.gen_program_shape) { program_shape = - absl::make_unique(compile_result.program_shape); + std::make_unique(compile_result.program_shape); // The parameter names are currently meaningless, and redundant with the // rest of our metadata, so clear them out to avoid confusion and save diff --git a/tensorflow/compiler/aot/codegen.h b/tensorflow/compiler/aot/codegen.h index 9485e86b10e..a0caceaf4c6 100644 --- a/tensorflow/compiler/aot/codegen.h +++ b/tensorflow/compiler/aot/codegen.h @@ -48,6 +48,9 @@ struct CodegenOpts { // If true, emit a serialized HloProfilePrinterData protobuf that can be used // to pretty print HLO profile counters. bool gen_hlo_profile_printer_data = false; + + // If true, sets this executable as an XLA Runtime one. + bool use_xla_runtime = false; }; // Describes a generated metadata object file. 
diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index 18e3182e686..dc02f88e6a9 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -215,18 +215,22 @@ TEST(CodegenTest, Golden) { CompileResult compile_result; compile_result.aot.reset(new xla::cpu::CpuAotCompilationResult( {}, - {BufferInfo::MakeTempBuffer(1), - BufferInfo::MakeEntryParameter(/*size=*/8, /*param_number=*/0), + {BufferInfo::MakeTempBuffer(3 * 8), + BufferInfo::MakeEntryParameter(/*size=*/8, /*entry_param_number=*/0), BufferInfo::MakeTempBuffer(1), - BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/1), + BufferInfo::MakeEntryParameter(/*size=*/96, /*entry_param_number=*/1), BufferInfo::MakeTempBuffer(1), - BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/2), + BufferInfo::MakeEntryParameter(/*size=*/96, /*entry_param_number=*/2), BufferInfo::MakeTempBuffer(1), - BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/3), - BufferInfo::MakeTempBuffer(1), - BufferInfo::MakeEntryParameter(/*size=*/96, /*param_number=*/4), - BufferInfo::MakeTempBuffer(1), BufferInfo::MakeTempBuffer(120)}, - 11, {})); + BufferInfo::MakeEntryParameter(/*size=*/96, /*entry_param_number=*/3), + BufferInfo::MakeResultParameter(/*size=*/5 * 6 * 4, + /*result_param_number=*/0), + BufferInfo::MakeEntryParameter(/*size=*/96, /*entry_param_number=*/4), + BufferInfo::MakeResultParameter(/*size=*/1 * 4, + /*result_param_number=*/1), + BufferInfo::MakeResultParameter(/*size=*/5 * 4, + /*result_param_number=*/2)}, + 0, {})); compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( { diff --git a/tensorflow/compiler/aot/codegen_test_h.golden b/tensorflow/compiler/aot/codegen_test_h.golden index b4af9ef633d..88aefb744ad 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -58,13 +58,15 @@ namespace bar { // Memory stats: // arg bytes total: 392 // 
arg bytes aligned: 576 -// temp bytes total: 126 +// temp bytes total: 171 // temp bytes aligned: 512 class MyClass final : public tensorflow::XlaCompiledCpuFunction { public: // Number of input arguments for the compiled computation. static constexpr size_t kNumArgs = 5; + static constexpr size_t kNumResults = 3; + // Number of variables for the compiled computation. static constexpr size_t kNumVariables = 3; @@ -81,16 +83,21 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { set_static_data_raw_function(data, entry_point); set_static_data_buffer_infos(data, BufferInfos()); set_static_data_num_buffers(data, kNumBuffers); + set_static_data_result_index_table(data, ResultIndexToBufferIndex()); + set_static_data_num_results(data, kNumResults); set_static_data_arg_index_table(data, ArgIndexToBufferIndex()); set_static_data_num_args(data, kNumArgs); set_static_data_num_variables(data, kNumVariables); set_static_data_result_index(data, kResultIndex); + set_static_data_arg_shape_infos(data, ArgShapeInfos()); + set_static_data_result_shape_infos(data, ResultShapeInfos()); set_static_data_arg_names(data, StaticArgNames()); set_static_data_variable_names(data, StaticVariableNames()); set_static_data_result_names(data, StaticResultNames()); set_static_data_program_shape(data, StaticProgramShape()); set_static_data_hlo_profile_printer_data( data, StaticHloProfilePrinterData()); + set_static_data_use_xla_runtime(data, false); return data; }(); @@ -110,7 +117,7 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { // // void set_argN_data(void* data) // Sets the buffer of type T for positional argument N. May be called in - // any AllocMode. Must be called before Run to have an affect. Must be + // any AllocMode. Must be called before Run to have an effect. Must be // called in AllocMode::RESULTS_PROFILES_AND_TEMPS_ONLY for each positional // argument, to set the argument buffers. 
// @@ -354,22 +361,29 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { static const ::xla::cpu_function_runtime::BufferInfo* BufferInfos() { static const ::xla::cpu_function_runtime::BufferInfo kBufferInfos[kNumBuffers] = { -::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), -::xla::cpu_function_runtime::BufferInfo({34ULL, 0ULL}), -::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), -::xla::cpu_function_runtime::BufferInfo({386ULL, 1ULL}), -::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), -::xla::cpu_function_runtime::BufferInfo({386ULL, 2ULL}), -::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), -::xla::cpu_function_runtime::BufferInfo({386ULL, 3ULL}), -::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), -::xla::cpu_function_runtime::BufferInfo({386ULL, 4ULL}), -::xla::cpu_function_runtime::BufferInfo({5ULL, ~0ULL}), -::xla::cpu_function_runtime::BufferInfo({481ULL, ~0ULL}) +::xla::cpu_function_runtime::BufferInfo(97ULL, ~0U, ~0U), +::xla::cpu_function_runtime::BufferInfo(34ULL, 0U, ~0U), +::xla::cpu_function_runtime::BufferInfo(5ULL, ~0U, ~0U), +::xla::cpu_function_runtime::BufferInfo(386ULL, 1U, ~0U), +::xla::cpu_function_runtime::BufferInfo(5ULL, ~0U, ~0U), +::xla::cpu_function_runtime::BufferInfo(386ULL, 2U, ~0U), +::xla::cpu_function_runtime::BufferInfo(5ULL, ~0U, ~0U), +::xla::cpu_function_runtime::BufferInfo(386ULL, 3U, ~0U), +::xla::cpu_function_runtime::BufferInfo(481ULL, ~0U, 0U), +::xla::cpu_function_runtime::BufferInfo(386ULL, 4U, ~0U), +::xla::cpu_function_runtime::BufferInfo(17ULL, ~0U, 1U), +::xla::cpu_function_runtime::BufferInfo(81ULL, ~0U, 2U) }; return kBufferInfos; } + static const ::tensorflow::int32* ResultIndexToBufferIndex() { + static constexpr ::tensorflow::int32 kResultIndexToBufferIndex[kNumResults] = { +8, 10, 11 + }; + return kResultIndexToBufferIndex; + } + static const ::tensorflow::int32* ArgIndexToBufferIndex() { static constexpr ::tensorflow::int32 kArgIndexToBufferIndex[kNumArgs] = { 
1, 3, 5, 7, 9 @@ -378,7 +392,53 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { } // The 0-based index of the result tuple in the temporary buffers. - static constexpr size_t kResultIndex = 11; + static constexpr size_t kResultIndex = 0; + + // Shapes of the input arguments. + static constexpr int32_t kArg0Shapes[] = { +1, 2 + }; + static constexpr int32_t kArg1Shapes[] = { +3, 4 + }; + static constexpr int32_t kArg2Shapes[] = { +1 + }; + static constexpr int32_t kArg3Shapes[] = { +1 + }; + static constexpr int32_t kArg4Shapes[] = { +5 + }; + static const ShapeInfo* ArgShapeInfos() { + static constexpr ShapeInfo kArgShapeInfoTable[kNumArgs] = { +{ kArg0Shapes, 2 }, +{ kArg1Shapes, 2 }, +{ kArg2Shapes, 1 }, +{ kArg3Shapes, 1 }, +{ kArg4Shapes, 1 }, + }; + return kArgShapeInfoTable; + }; + + // Shapes of the results. + static constexpr int32_t kResult0Shapes[] = { +5, 6 + }; + static constexpr int32_t kResult1Shapes[] = { +1 + }; + static constexpr int32_t kResult2Shapes[] = { +5 + }; + static const ShapeInfo* ResultShapeInfos() { + static constexpr ShapeInfo kResultShapeInfoTable[kNumResults] = { +{ kResult0Shapes, 2 }, +{ kResult1Shapes, 1 }, +{ kResult2Shapes, 1 }, + }; + return kResultShapeInfoTable; + }; // Array of names of each positional argument, terminated by nullptr. static const char** StaticArgNames() { diff --git a/tensorflow/compiler/aot/compile.cc b/tensorflow/compiler/aot/compile.cc index fd3bf0bb7e9..290a6bb4ab4 100644 --- a/tensorflow/compiler/aot/compile.cc +++ b/tensorflow/compiler/aot/compile.cc @@ -273,6 +273,15 @@ Status Main(const MainFlags& flags) { codegen_opts.gen_name_to_index = flags.gen_name_to_index; codegen_opts.gen_program_shape = flags.gen_program_shape; codegen_opts.target_triple = flags.target_triple; + // Set the XLA Runtime bit if this is an HloLowering. 
+ if (!flags.mlir_components.empty() && flags.mlir_components != "None") { + for (auto component : absl::StrSplit(flags.mlir_components, ',')) { + if (component == "HloLowering") { + codegen_opts.use_xla_runtime = true; + } + } + } + if (flags.cpp_class.empty()) { return errors::InvalidArgument("Must specify --cpp_class"); } diff --git a/tensorflow/compiler/aot/tests/BUILD b/tensorflow/compiler/aot/tests/BUILD index f04aa37c887..191188b674d 100644 --- a/tensorflow/compiler/aot/tests/BUILD +++ b/tensorflow/compiler/aot/tests/BUILD @@ -69,18 +69,18 @@ py_binary( srcs_version = "PY3", deps = [ "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client", - "//tensorflow/python:cond", - "//tensorflow/python:control_flow_assert", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn_ops", - "//tensorflow/python:session", - "//tensorflow/python:training", - "//tensorflow/python:variable_v1", - "//tensorflow/python:variables", + "//tensorflow/python/client", + "//tensorflow/python/client:session", + "//tensorflow/python/framework:for_generated_wrappers", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:cond", + "//tensorflow/python/ops:control_flow_assert", + "//tensorflow/python/ops:control_flow_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:variable_v1", + "//tensorflow/python/ops:variables", + "//tensorflow/python/training", "@absl_py//absl:app", "@six_archive//:six", ], @@ -437,6 +437,7 @@ tf_cc_test( tags = [ "manual", "no_mac", # TODO(b/228273415) + "not_run:arm", ], deps = [ ":test_graph_tfadd", @@ -510,6 +511,7 @@ tf_cc_test( tags = [ "manual", "no_mac", # TODO(b/228273415) + "not_run:arm", ], deps = [ ":test_graph_tfadd_mlir_bridge", diff --git a/tensorflow/compiler/aot/tests/tfcompile_test.cc b/tensorflow/compiler/aot/tests/tfcompile_test.cc 
index 872ce4160c3..64138e47c98 100644 --- a/tensorflow/compiler/aot/tests/tfcompile_test.cc +++ b/tensorflow/compiler/aot/tests/tfcompile_test.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #define EIGEN_USE_THREADS #define EIGEN_USE_CUSTOM_THREAD_POOL diff --git a/tensorflow/compiler/aot/tfcompile.bzl b/tensorflow/compiler/aot/tfcompile.bzl index c1f8fdc089a..c965760785a 100644 --- a/tensorflow/compiler/aot/tfcompile.bzl +++ b/tensorflow/compiler/aot/tfcompile.bzl @@ -319,6 +319,8 @@ def _tf_library( ] or []) + (include_standard_runtime_deps and [ # TODO(cwhipkey): only depend on kernel code that the model actually # needed. + "//tensorflow/compiler/xla/service/cpu/runtime:convolution_ffi", + "//tensorflow/compiler/xla/service/cpu/runtime:rng_ffi", "//tensorflow/compiler/xla/service/cpu:runtime_conv2d", "//tensorflow/compiler/xla/service/cpu:runtime_custom_call_status", "//tensorflow/compiler/xla/service/cpu:runtime_key_value_sort", @@ -329,6 +331,7 @@ def _tf_library( "//third_party/eigen3", ] or []) + ( mlir_components.count("HloLowering") > 0 and [ + "//tensorflow/compiler/xla/runtime:aot_ffi_c_symbols", "//tensorflow/compiler/xla/service/cpu:runtime_mlir_utils", ] or [] ) + ( diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index b3fd29ff259..9bc3348b38a 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1,5 +1,5 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") -load("//tensorflow:tensorflow.bzl", "if_libtpu", "if_with_tpu_support", "tf_cc_test", "tf_copts", "tf_cuda_cc_test") +load("//tensorflow:tensorflow.bzl", "if_libtpu", "if_with_tpu_support", "tf_cc_test", "tf_copts", "tf_cuda_cc_test", "tf_cuda_only_cc_test") load("//tensorflow/compiler/xla/stream_executor:build_defs.bzl", "if_cuda_or_rocm") 
load("//tensorflow:tensorflow.default.bzl", "cc_header_only_library", "filegroup", "tf_custom_op_py_library", "tf_jit_compilation_passes_extra_deps") load("//tensorflow/core/platform:build_config.bzl", "tf_additional_all_protos", "tf_proto_library") @@ -352,8 +352,10 @@ cc_library( "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla/hlo/ir:hlo", "//tensorflow/compiler/xla/pjrt:pjrt_client", + "//tensorflow/compiler/xla/pjrt:tf_pjrt_client", "//tensorflow/compiler/xla/service:executable", "//tensorflow/core/tfrt/common:create_pjrt_client_util", + "//tensorflow/core/tfrt/common:global_state", "//tensorflow/core/tfrt/common:pjrt_util", "//tensorflow/core/tpu:tpu_defs", "@com_google_absl//absl/log", @@ -595,7 +597,8 @@ tf_cc_test( "//tensorflow/core/framework:fake_input", "//tensorflow/core/kernels:identity_op", "//tensorflow/core/kernels:ops_testutil", - "@com_google_googletest//:gtest_main", + "//tensorflow/core/tpu:tpu_defs", + "@com_google_googletest//:gtest", ], ) @@ -782,6 +785,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core:testlib", "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", ], ) @@ -1509,6 +1513,70 @@ cc_library( ], ) +cc_library( + name = "xla_host_recv_device_context", + srcs = [ + "xla_host_recv_device_context.cc", + ], + hdrs = [ + "xla_host_recv_device_context.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/stream_executor", + "//tensorflow/compiler/xla/stream_executor:device_memory", + "//tensorflow/core:framework", + "@tf_runtime//:async_value", + ], +) + +cc_library( + name = "xla_host_send_device_context", + srcs = [ + "xla_host_send_device_context.cc", + ], + hdrs = [ + "xla_host_send_device_context.h", + ], + visibility = ["//visibility:public"], + deps = [ + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/xla:shape_util", + 
"//tensorflow/compiler/xla/stream_executor", + "//tensorflow/compiler/xla/stream_executor:device_memory", + "//tensorflow/core:framework", + "@tf_runtime//:async_value", + ], +) + +tf_cuda_only_cc_test( + name = "xla_host_send_recv_device_context_test", + srcs = ["xla_host_send_recv_device_context_test.cc"], + tags = tf_cuda_tests_tags() + [ + "config-cuda-only", + "no_oss", # Temporarily disable OSS. + ], + deps = [ + ":flags", + ":xla_device", + ":xla_gpu_device", + ":xla_host_recv_device_context", + ":xla_host_send_device_context", + "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/tf2xla:xla_op_registry", + "//tensorflow/compiler/xla:shape_util", + "//tensorflow/compiler/xla/stream_executor", + "//tensorflow/compiler/xla/stream_executor:device_memory", + "//tensorflow/compiler/xla/stream_executor:multi_platform_manager", + "//tensorflow/core:framework_internal", + "//tensorflow/core:test", + "//tensorflow/core/framework:tensor_testutil", + "@com_google_googletest//:gtest_main", + ], +) + tf_cc_test( name = "device_compilation_cluster_signature_test", srcs = [ @@ -1527,6 +1595,9 @@ tf_cc_test( tf_cc_test( name = "device_compilation_profiler_test", srcs = ["device_compilation_profiler_test.cc"], + tags = [ + "nomsan", # TODO(b/284492454) + ], deps = [ ":device_compilation_profiler", ":xla_activity_proto_cc", @@ -1641,6 +1712,7 @@ tf_cuda_cc_test( tags = tf_cuda_tests_tags(), deps = [ ":flags", + ":pjrt_device_compiler_client", ":test_util", ":xla_device_no_jit_rewrite_registration", ":xla_gpu_device", @@ -1649,6 +1721,7 @@ tf_cuda_cc_test( "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/client:client_library", + "//tensorflow/compiler/xla/pjrt:pjrt_client", "//tensorflow/core:framework", "//tensorflow/core:framework_types_hdr", "//tensorflow/core/tpu:tpu_defs", diff --git a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc 
b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc index f0fcd17ba23..e426c9d40d9 100644 --- a/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc +++ b/tensorflow/compiler/jit/encapsulate_subgraphs_pass.cc @@ -19,9 +19,9 @@ limitations under the License. #include #include #include -#include #include +#include "absl/container/flat_hash_map.h" #include "absl/container/flat_hash_set.h" #include "absl/strings/match.h" #include "absl/strings/str_cat.h" @@ -188,7 +188,7 @@ class Encapsulator { // Adds the function call node to graph_out. Status AddFunctionCallNode( - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, Graph* graph_out); // Returns the Node that the inputs and outputs of the function should be @@ -206,7 +206,7 @@ class Encapsulator { // and adds the edge within the subgraph from the _Arg node to the image of // the dst node. Status RecordArg(const Edge* edge, - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, std::vector>* src_arg_pairs); // Records the src of the given edge as a control result of the graph. @@ -214,14 +214,14 @@ class Encapsulator { // the function signature. Status RecordControlResult( const Edge* edge, - const std::unordered_map& node_images); + const absl::flat_hash_map& node_images); // Creates a _Retval node for the src node of edge, and add it to results_, // if none exists yet. If a new _Retval node is created, also adds the edge // within the subgraph from the src to the _Retval node. Status RecordResult( const Edge* edge, - const std::unordered_map& node_images); + const absl::flat_hash_map& node_images); // Creates the sequencer node if it doesn't exist, adding it to graph_out. Status MakeSequencingNode(const string& subgraph_name, Graph* graph_out); @@ -260,14 +260,14 @@ class Encapsulator { // (consumer node/slot) tensors in the input graph to _Arg numbers in // the subgraph. The source map is one-to-one, whereas the dest map may be // many-to-one. 
- std::unordered_map args_by_src_; - std::unordered_map args_by_dst_; + absl::flat_hash_map args_by_src_; + absl::flat_hash_map args_by_dst_; // The arguments to the subgraph, in order. std::vector args_; // Map from source tensor in the input graph to result #. - std::unordered_map results_; + absl::flat_hash_map results_; // Set of node names that are the source of a control output of the // subgraph. We store strings here so that we can tolerate nodes being @@ -285,19 +285,20 @@ class Encapsulator { // Copies edges local to a subgraph. Adds _Arg and _Retval nodes to // subgraphs for data edges that cross subgraph boundaries. Status CopySubgraphEdges( - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, std::vector>* src_arg_pairs); // Copies all marked nodes to a subgraph. Does nothing for unmarked nodes. - Status CopySubgraphNodes(std::unordered_map* node_images); + Status CopySubgraphNodes( + absl::flat_hash_map* node_images); // Copies all nodes that aren't in a compiled subgraph to the output graph. Status CopyNodesToOutputGraph( - Graph* graph_out, std::unordered_map* node_images); + Graph* graph_out, absl::flat_hash_map* node_images); // Adds function call nodes for each compiled subgraph. Status AddFunctionCallNodes( - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, Graph* graph_out); // Finds the image of an edge source in the output graph. If the edge crosses @@ -305,7 +306,7 @@ class Encapsulator { // in the output graph. Status FindOutputImageOfEdgeSrc( const string& src_func_id, const string& dst_func_id, - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, const Node* original_src_node, Node** src_image); // Finds an edge source slot in the output graph. If the edge crosses a @@ -320,7 +321,7 @@ class Encapsulator { // a node in the output graph. 
Status FindOutputImageOfEdgeDst( const string& src_func_id, const string& dst_func_id, - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, const Node* original_dst_node, Node** dst_image); // Finds an edge destination slot in the output graph. If the edge crosses a @@ -334,14 +335,14 @@ class Encapsulator { // within the output graph, or crosses into or out of a compiled subgraph. Status CopyEdgeToOutputGraph( const Edge* edge, const string& src_func_id, const string& dst_func_id, - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, Graph* graph_out, - std::unordered_set, - OutputInputTensorPairHasher>* edges_added); + absl::flat_hash_set, + OutputInputTensorPairHasher>* edges_added); // Adds all edges to the output graph. Status AddEdgesToOutputGraph( - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, Graph* graph_out); // Makes a copy of graph containing only nodes that are ancestors of at least @@ -351,13 +352,13 @@ class Encapsulator { Status MakePrunedGraphCopyAndInline( const Graph& graph, const std::vector& sink_nodes, std::unique_ptr* pruned_graph, - std::unordered_map* node_images, + absl::flat_hash_map* node_images, FunctionLibraryDefinition* library); const string group_attribute_; const Graph* graph_in_; - std::unordered_map subgraphs_; + absl::flat_hash_map subgraphs_; TF_DISALLOW_COPY_AND_ASSIGN(Encapsulator); }; @@ -369,9 +370,9 @@ namespace { // including clusters that are not present in the ancestors map. has_successors // is the set of clusters that are ancestors of some other cluster. void TopologicalClusterSort( - const std::unordered_set& clusters, - const std::unordered_set& has_successors, - const std::unordered_map>& ancestors, + const absl::flat_hash_set& clusters, + const absl::flat_hash_set& has_successors, + const absl::flat_hash_map>& ancestors, std::vector* sorted) { // The nodes are placed in 'sorted' in topological order. 
sorted->clear(); @@ -447,11 +448,12 @@ Node* Encapsulator::Subgraph::MakeNodeImage(const Graph* graph_in, Node* node) { Graph* Encapsulator::Subgraph::GetGraph() const { return graph_.get(); } Status Encapsulator::Subgraph::RecordArg( - const Edge* edge, const std::unordered_map& node_images, + const Edge* edge, + const absl::flat_hash_map& node_images, std::vector>* src_arg_pairs) { Node* src_node = edge->src(); int src_slot = edge->src_output(); - std::unordered_map::iterator iter; + absl::flat_hash_map::iterator iter; bool inserted; std::tie(iter, inserted) = args_by_src_.emplace( OutputTensor(src_node, src_slot), args_by_src_.size()); @@ -481,7 +483,7 @@ Status Encapsulator::Subgraph::RecordArg( Status Encapsulator::Subgraph::RecordControlResult( const Edge* edge, - const std::unordered_map& node_images) { + const absl::flat_hash_map& node_images) { Node* src_node = edge->src(); Node* src_image = node_images.at(src_node); control_output_nodes_.insert(src_image->name()); @@ -490,11 +492,11 @@ Status Encapsulator::Subgraph::RecordControlResult( Status Encapsulator::Subgraph::RecordResult( const Edge* edge, - const std::unordered_map& node_images) { + const absl::flat_hash_map& node_images) { Node* src_node = edge->src(); Node* src_image = node_images.at(src_node); int src_slot = edge->src_output(); - std::unordered_map::iterator iter; + absl::flat_hash_map::iterator iter; bool inserted; std::tie(iter, inserted) = results_.emplace(OutputTensor(src_node, src_slot), results_.size()); @@ -592,7 +594,7 @@ Status Encapsulator::Subgraph::BuildFunctionDef( FunctionDef fdef; auto lookup = [this](const Node* node) -> std::optional { if (control_output_nodes_.contains(node->name())) { - return absl::make_optional(node->name()); + return std::make_optional(node->name()); } return std::nullopt; }; @@ -637,7 +639,7 @@ Status Encapsulator::Subgraph::ReplaceFunctionDef( } Status Encapsulator::Subgraph::AddFunctionCallNode( - const std::unordered_map& node_images, + const 
absl::flat_hash_map& node_images, Graph* graph_out) { TF_ASSIGN_OR_RETURN(call_node_, graph_out->AddNode(call_node_def_)); @@ -663,7 +665,7 @@ Status Encapsulator::GetFunctionNameAttr(Node const* node, string* attr) const { bool IsInSubgraph(const string& func_id) { return !func_id.empty(); } Status Encapsulator::CopySubgraphNodes( - std::unordered_map* node_images) { + absl::flat_hash_map* node_images) { for (Node* node : graph_in_->op_nodes()) { string func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id)); @@ -678,7 +680,7 @@ Status Encapsulator::CopySubgraphNodes( } Status Encapsulator::CopySubgraphEdges( - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, std::vector>* src_arg_pairs) { for (const Edge* edge : graph_in_->edges()) { string src_func_id; @@ -752,7 +754,7 @@ Status Encapsulator::SplitIntoSubgraphs(FunctionLibraryDefinition* library) { Status s; // Map from input graph nodes to subgraph nodes. - std::unordered_map node_images; + absl::flat_hash_map node_images; // Each entry of src_arg_pairs is a pair whose first element is a node in the // original graph that has an output edge in the subgraph, and whose second @@ -794,7 +796,7 @@ Status Encapsulator::BuildFunctionDefs( } Status Encapsulator::CopyNodesToOutputGraph( - Graph* graph_out, std::unordered_map* node_images) { + Graph* graph_out, absl::flat_hash_map* node_images) { for (Node* node : graph_in_->op_nodes()) { string func_id; TF_RETURN_IF_ERROR(GetFunctionNameAttr(node, &func_id)); @@ -811,7 +813,7 @@ Status Encapsulator::CopyNodesToOutputGraph( } Status Encapsulator::AddFunctionCallNodes( - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, Graph* graph_out) { for (auto& subgraph_entry : subgraphs_) { TF_RETURN_IF_ERROR( @@ -822,7 +824,7 @@ Status Encapsulator::AddFunctionCallNodes( Status Encapsulator::FindOutputImageOfEdgeSrc( const string& src_func_id, const string& dst_func_id, - const std::unordered_map& 
node_images, + const absl::flat_hash_map& node_images, const Node* original_src_node, Node** src_image) { if (IsInSubgraph(src_func_id)) { // The edge is from a subgraph to a regular node in the output graph so @@ -853,7 +855,7 @@ int Encapsulator::FindOutputSlotOfEdgeSrc(const string& src_func_id, Status Encapsulator::FindOutputImageOfEdgeDst( const string& src_func_id, const string& dst_func_id, - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, const Node* original_dst_node, Node** dst_image) { if (IsInSubgraph(dst_func_id)) { // The edge is to a subgraph from a regular node in the output graph so @@ -884,9 +886,10 @@ int Encapsulator::FindOutputSlotOfEdgeDst(const string& src_func_id, Status Encapsulator::CopyEdgeToOutputGraph( const Edge* edge, const string& src_func_id, const string& dst_func_id, - const std::unordered_map& node_images, Graph* graph_out, - std::unordered_set, - OutputInputTensorPairHasher>* edges_added) { + const absl::flat_hash_map& node_images, + Graph* graph_out, + absl::flat_hash_set, + OutputInputTensorPairHasher>* edges_added) { Node* src_image; TF_RETURN_IF_ERROR(FindOutputImageOfEdgeSrc( src_func_id, dst_func_id, node_images, edge->src(), &src_image)); @@ -924,13 +927,13 @@ Status Encapsulator::CopyEdgeToOutputGraph( } Status Encapsulator::AddEdgesToOutputGraph( - const std::unordered_map& node_images, + const absl::flat_hash_map& node_images, Graph* graph_out) { // Set of edges already added to the output graph, represented as (src, dst) // pairs. We use the set to deduplicate edges; multiple edges in the input // graph may map to one edge in the output graph. 
- std::unordered_set, - OutputInputTensorPairHasher> + absl::flat_hash_set, + OutputInputTensorPairHasher> edges_added; for (const Edge* edge : graph_in_->edges()) { @@ -1010,7 +1013,7 @@ Node* AddDummyShapedNode(const Node* src_node, int src_port, Status Encapsulator::MakePrunedGraphCopyAndInline( const Graph& graph, const std::vector& sink_nodes, std::unique_ptr* pruned_graph, - std::unordered_map* node_images, + absl::flat_hash_map* node_images, FunctionLibraryDefinition* library) { // First copy all ancestor nodes of sink_nodes into a new graph. pruned_graph->reset(new Graph(library)); @@ -1070,7 +1073,7 @@ Status Encapsulator::MakePrunedGraphCopyAndInline( Status Encapsulator::BuildOutputGraph(Graph* graph_out, FunctionLibraryDefinition* library) { // Map from nodes in the input graph to nodes in the output graph. - std::unordered_map node_images; + absl::flat_hash_map node_images; TF_RETURN_IF_ERROR(CopyNodesToOutputGraph(graph_out, &node_images)); TF_RETURN_IF_ERROR(AddFunctionCallNodes(node_images, graph_out)); diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index 6e8fceaf47d..54c79d77ca8 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -34,6 +34,7 @@ BuildXlaOpsPassFlags* build_ops_flags; MarkForCompilationPassFlags* mark_for_compilation_flags; XlaDeviceFlags* device_flags; XlaOpsCommonFlags* ops_flags; +XlaCallModuleFlags* call_module_flags; MlirCommonFlags* mlir_flags; JitRtFlags* jitrt_flags; std::vector* jitrt_flag_list; @@ -76,6 +77,13 @@ bool SetterForXlaAutoJitFlag(const string& value) { return true; } +bool SetterForXlaCallModuleDisabledChecks(const string& value) { + auto directives = absl::StrSplit(value, ',', absl::SkipEmpty()); + call_module_flags->disabled_checks.insert(directives.begin(), + directives.end()); + return true; +} + void AppendMarkForCompilationPassFlagsInternal(std::vector* flag_list) { std::vector new_flags = { Flag("tf_xla_auto_jit", 
SetterForXlaAutoJitFlag, "0", @@ -184,6 +192,7 @@ void AllocateAndParseFlags() { build_ops_flags->tf_xla_check_cluster_input_numerics = false; build_ops_flags->tf_xla_check_cluster_output_numerics = false; build_ops_flags->tf_xla_disable_constant_folding = false; + build_ops_flags->tf_xla_disable_full_embedding_pipelining = false; mark_for_compilation_flags = new MarkForCompilationPassFlags; mark_for_compilation_flags->xla_auto_jit_flag.optimization_level_single_gpu = @@ -213,9 +222,12 @@ void AllocateAndParseFlags() { ops_flags = new XlaOpsCommonFlags; ops_flags->tf_xla_always_defer_compilation = false; ops_flags->tf_xla_async_compilation = false; - ops_flags->tf_xla_use_device_api.enabled_for_xla_launch_ = false; - ops_flags->tf_xla_use_device_api.enabled_for_compile_on_demand_ = false; + ops_flags->tf_xla_use_device_api.enabled_for_xla_launch_ = true; + ops_flags->tf_xla_use_device_api.enabled_for_compile_on_demand_ = true; + ops_flags->tf_xla_use_device_api.enabled_for_compile_and_run_ = false; + ops_flags->tf_xla_use_device_api.enabled_for_all_ = false; + call_module_flags = new XlaCallModuleFlags; // The `enable_mlir_bridge` flag allows the user to explicitly request that // their program is (or isn't) compiled using the MLIR-based TF-to-XLA bridge. 
// @@ -251,6 +263,10 @@ void AllocateAndParseFlags() { &build_ops_flags->tf_xla_disable_constant_folding, "If true then disables constant folding on TF graph before XLA " "compilation."), + Flag("tf_xla_disable_full_embedding_pipelining", + &build_ops_flags->tf_xla_disable_full_embedding_pipelining, + "If true then disables full embedding pipelining and instead use " + "strict SparseCore / TensorCore sequencing."), Flag("tf_xla_compile_on_demand", &device_flags->tf_xla_compile_on_demand, "Switch a device into 'on-demand' mode, where instead of " @@ -277,6 +293,22 @@ void AllocateAndParseFlags() { &ops_flags->tf_xla_use_device_api.enabled_for_compile_on_demand_, "If true, uses Device API (PjRt) for compiling and executing ops " "one by one in 'on-demand' mode. Defaults to false."), + Flag("tf_xla_use_device_api_for_auto_jit", + &ops_flags->tf_xla_use_device_api.enabled_for_compile_and_run_, + "If true, uses Device API (PjRt) for compilation and execution " + "when auto-clustering is enabled. Defaults to false."), + Flag("tf_xla_use_device_api", + &ops_flags->tf_xla_use_device_api.enabled_for_all_, + "If true, uses Device API (PjRt) for compilation and execution " + "of ops one-by-one in 'on-demand' mode, for functions marked for " + "JIT compilation, or when auto-clustering is enabled. Defaults to " + "false."), + + Flag("tf_xla_call_module_disabled_checks", + SetterForXlaCallModuleDisabledChecks, "", + "A comma-sepated list of directives specifying the safety checks " + "to be skipped when compiling XlaCallModuleOp. 
See the op " + "documentation for the recognized values."), Flag("tf_mlir_enable_mlir_bridge", &enable_mlir_bridge, "Enables experimental MLIR-Based TensorFlow Compiler Bridge.", @@ -365,6 +397,11 @@ XlaOpsCommonFlags* GetXlaOpsCommonFlags() { return ops_flags; } +XlaCallModuleFlags* GetXlaCallModuleFlags() { + absl::call_once(flags_init, &AllocateAndParseFlags); + return call_module_flags; +} + MlirCommonFlags* GetMlirCommonFlags() { absl::call_once(flags_init, &AllocateAndParseFlags); return mlir_flags; diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 9f151b89eb7..042b3688fba 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -137,8 +137,9 @@ struct XlaOpsCommonFlags { } bool IsEnabledInXlaLaunchForDevice(const DeviceType& device_type) const { - return enabled_for_xla_launch_ && - xla_launch_allowed_devices_.contains(device_type.type_string()); + return enabled_for_all_ || + (enabled_for_xla_launch_ && + xla_launch_allowed_devices_.contains(device_type.type_string())); } // Allow using Device API (PjRt) for `device_type` in the XlaCompileOnDemand @@ -152,9 +153,26 @@ struct XlaOpsCommonFlags { bool IsEnabledInXlaCompileOnDemandForDevice( const DeviceType& device_type) const { - return enabled_for_compile_on_demand_ && - xla_compile_on_demand_allowed_devices_.contains( - device_type.type_string()); + return enabled_for_all_ || + (enabled_for_compile_on_demand_ && + xla_compile_on_demand_allowed_devices_.contains( + device_type.type_string())); + } + + // Allow using Device API (PjRt) for `device_type` in the XlaCompile and + // XlaRun ops. Please note that `enabled_for_compile_and_run_` needs to be + // true in addition to the `device_type` being allowed in order to use the + // Device API for single device compilation and execution in the XlaCompile + // and XlaRun ops. 
+ void AllowForDeviceInXlaCompileAndRun(const DeviceType& device_type) { + xla_compile_and_run_allowed_devices_.insert(device_type.type_string()); + } + + bool IsEnabledInXlaCompileAndRunForDevice( + const DeviceType& device_type) const { + return enabled_for_all_ || (enabled_for_compile_and_run_ && + xla_compile_and_run_allowed_devices_.contains( + device_type.type_string())); } // If true, uses Device API (PjRt) for single device compilation and @@ -166,6 +184,16 @@ struct XlaOpsCommonFlags { // one in "on-demand" mode. Defaults to false. bool enabled_for_compile_on_demand_; + // If true, uses Device API (PjRt) for compilation and execution when + // auto-clustering is enabled. Defaults to false. + bool enabled_for_compile_and_run_; + + // If true, uses Device API (PjRt) for compilation and execution everywhere + // i.e. for functions marked for JIT compilation, for ops in "on-demand" + // mode and autoclustering, no matter whether other flags are enabled or + // not, and whether devices have been allowed or not. Defaults to false. + bool enabled_for_all_; + private: // Devices for which using Device API (PjRt) is allowed in the XlaLaunch op. // This can only be modified programmatically. @@ -173,9 +201,18 @@ struct XlaOpsCommonFlags { // Devices for which using Device API (PjRt) is allowed in the // XlaCompileOnDemand op. This can only be modified programmatically. absl::flat_hash_set xla_compile_on_demand_allowed_devices_; + // Devices for which using Device API (PjRt) is allowed in the + // XlaCompile and XlaRun ops. This can only be modified programmatically. + absl::flat_hash_set xla_compile_and_run_allowed_devices_; } tf_xla_use_device_api; }; +// Flags for the XlaCallModule kernel. +struct XlaCallModuleFlags { + // Used by XlaCallModuleOp to specify safety checks to disable. + absl::flat_hash_set disabled_checks; +}; + // Flags for the build_xla_ops pass. 
struct BuildXlaOpsPassFlags { // Enables lazy compilation for TF/XLA (only when auto-clustering) if true. @@ -197,6 +234,10 @@ struct BuildXlaOpsPassFlags { // Disables all constant folding. The primary use for this is for testing to // guarantee that tests are run on XLA and not on TF's CPU implementation. bool tf_xla_disable_constant_folding; + + // Disables full embedding pipelining when true. Instead, strict SparseCore + // TensorCore sequencing will be used. + bool tf_xla_disable_full_embedding_pipelining; }; // Flags for common MLIR configurations. @@ -235,6 +276,7 @@ MarkForCompilationPassFlags* GetMarkForCompilationPassFlags(); BuildXlaOpsPassFlags* GetBuildXlaOpsPassFlags(); XlaDeviceFlags* GetXlaDeviceFlags(); XlaOpsCommonFlags* GetXlaOpsCommonFlags(); +XlaCallModuleFlags* GetXlaCallModuleFlags(); MlirCommonFlags* GetMlirCommonFlags(); diff --git a/tensorflow/compiler/jit/kernels/BUILD b/tensorflow/compiler/jit/kernels/BUILD index 8046207ed54..d651933a5d2 100644 --- a/tensorflow/compiler/jit/kernels/BUILD +++ b/tensorflow/compiler/jit/kernels/BUILD @@ -23,6 +23,8 @@ XLA_OPS_DEPS = [ "//tensorflow/compiler/jit:variable_info_util", "//tensorflow/compiler/jit:xla_device_no_jit_rewrite_registration", "//tensorflow/compiler/jit:xla_cluster_util", + "//tensorflow/compiler/jit:xla_host_recv_device_context", + "//tensorflow/compiler/jit:xla_host_send_device_context", "//tensorflow/compiler/jit:xla_launch_util", "//tensorflow/compiler/tf2xla:common", "//tensorflow/compiler/tf2xla:tf2xla_util", @@ -59,6 +61,8 @@ cc_library( "//tensorflow/compiler/jit:xla_compile_util", "//tensorflow/compiler/xla/pjrt:pjrt_client", "//tensorflow/core/platform:refcount", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", ], alwayslink = 1, ) diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 913cca35be3..ff134d49c50 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ 
b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -20,11 +20,15 @@ limitations under the License. #include #include #include +#include +#include #include #include #include #include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "absl/types/optional.h" #include "tensorflow/compiler/jit/device_compilation_profiler.h" #include "tensorflow/compiler/jit/device_compiler.h" @@ -35,6 +39,8 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/compiler/jit/xla_compile_util.h" #include "tensorflow/compiler/jit/xla_compiler_options_util.h" +#include "tensorflow/compiler/jit/xla_host_recv_device_context.h" +#include "tensorflow/compiler/jit/xla_host_send_device_context.h" #include "tensorflow/compiler/jit/xla_launch_util.h" #include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/tf2xla_util.h" @@ -92,10 +98,11 @@ auto* xla_launch_counter = monitoring::Counter<1>::New( // the initial values for the resource variables (and cannot snapshot them again // during execution) because otherwise we risk observing a different snapshot // with shapes different from what we compiled for. 
-class XlaExecutableClosure { +template +class ExecutableClosure { public: - explicit XlaExecutableClosure( - xla::LocalClient* client, xla::LocalExecutable* executable, + explicit ExecutableClosure( + ClientType* client, ExecutableType* executable, const XlaCompiler::CompilationResult* compilation_result, ResourceVarsSnapshot resource_var_snapshots, int num_constant_args) : client_(client), @@ -104,11 +111,11 @@ class XlaExecutableClosure { resource_var_snapshots_(std::move(resource_var_snapshots)), num_constant_args_(num_constant_args) {} - XlaExecutableClosure(XlaExecutableClosure&&) = default; - XlaExecutableClosure& operator=(XlaExecutableClosure&&) = default; + ExecutableClosure(ExecutableClosure&&) = default; + ExecutableClosure& operator=(ExecutableClosure&&) = default; - xla::LocalClient* client() const { return client_; } - xla::LocalExecutable* executable() const { return executable_; } + ClientType* client() const { return client_; } + ExecutableType* executable() const { return executable_; } const XlaCompiler::CompilationResult* compilation_result() const { return compilation_result_; } @@ -118,24 +125,25 @@ class XlaExecutableClosure { int num_constant_args() const { return num_constant_args_; } private: - xla::LocalClient* client_; - xla::LocalExecutable* executable_; + ClientType* client_; + ExecutableType* executable_; const XlaCompiler::CompilationResult* compilation_result_; ResourceVarsSnapshot resource_var_snapshots_; int num_constant_args_; - TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosure); + TF_DISALLOW_COPY_AND_ASSIGN(ExecutableClosure); }; -// This maintains a mapping from a globally unique ID to XlaExecutableClosure +// This maintains a mapping from a globally unique ID to ExecutableClosure // instances. 
-class XlaExecutableClosureStore { +template +class ExecutableClosureStore { public: - XlaExecutableClosureStore() : key_counter_(0) {} + ExecutableClosureStore() : key_counter_(0) {} using KeyT = string; - KeyT Produce(XlaExecutableClosure result) { + KeyT Produce(ExecutableClosure result) { mutex_lock l(mutex_); KeyT key = absl::StrCat(key_counter_++); bool insert_successful = closures_.emplace(key, std::move(result)).second; @@ -144,29 +152,38 @@ class XlaExecutableClosureStore { return key; } - XlaExecutableClosure Consume(const KeyT& key) { + ExecutableClosure Consume(const KeyT& key) { mutex_lock l(mutex_); auto it = closures_.find(key); DCHECK(it != closures_.end()); - XlaExecutableClosure value = std::move(it->second); + ExecutableClosure value = std::move(it->second); closures_.erase(it); return value; } - static XlaExecutableClosureStore* Global() { - static XlaExecutableClosureStore* instance = new XlaExecutableClosureStore; + static ExecutableClosureStore* Global() { + static ExecutableClosureStore* instance = new ExecutableClosureStore; return instance; } private: mutex mutex_; int64_t key_counter_ TF_GUARDED_BY(mutex_); - absl::flat_hash_map closures_ - TF_GUARDED_BY(mutex_); + absl::flat_hash_map> + closures_ TF_GUARDED_BY(mutex_); - TF_DISALLOW_COPY_AND_ASSIGN(XlaExecutableClosureStore); + TF_DISALLOW_COPY_AND_ASSIGN(ExecutableClosureStore); }; +using XlaExecutableClosure = + ExecutableClosure; +using XlaExecutableClosureStore = + ExecutableClosureStore; +using PjRtExecutableClosure = + ExecutableClosure; +using PjRtExecutableClosureStore = + ExecutableClosureStore; + se::Stream* GetStream(OpKernelContext* ctx) { return ctx->op_device_context() ? 
ctx->op_device_context()->stream() : nullptr; @@ -185,6 +202,111 @@ XlaComputationLaunchContext GetLaunchContext( return launch_context; } +Status GetTaskName(const std::string_view device_name, std::string* task_name) { + string ignored; + if (!DeviceNameUtils::SplitDeviceName(device_name, task_name, &ignored)) { + return errors::InvalidArgument("Unable to parse device name: ", + device_name); + } + + return OkStatus(); +} + +// Provide SendDeviceMemoryFunction for XLA host callbacks. This callback +// handles transferring from device to host. +xla::SendDeviceMemoryFunction GetSendDeviceMemoryFunction( + OpKernelContext* ctx) { + return + [ctx](int64_t channel_id, se::Stream* stream, const xla::Shape& shape, + const se::DeviceMemoryBase& device_memory_base, + const absl::flat_hash_map& frontend_attrs) + -> StatusOr> { + auto iter = frontend_attrs.find("_xla_host_transfer_rendezvous"); + + // Generate the Rendezvous key. + const std::string& rendezvous_key_base = iter->second; + const std::string& src_device = ctx->device()->name(); + + std::string task_prefix; + TF_RETURN_IF_ERROR(GetTaskName(src_device, &task_prefix)); + const std::string dst_device = + absl::StrCat(task_prefix, "/device:CPU:0"); + const std::string& rendezvous_key = + Rendezvous::CreateKey(src_device, /*src_incarnation=*/1, dst_device, + rendezvous_key_base, FrameAndIter(0, 0)); + VLOG(2) << "Rendezvous Key for receiving at host: " << rendezvous_key; + + RendezvousInterface::ParsedKey parsed_key; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(rendezvous_key, &parsed_key)); + + tsl::AsyncValueRef done_event = + tsl::MakeConstructedAsyncValueRef(stream->parent()); + if (!done_event->Init()) { + return errors::Internal( + "Failed to initialize done event (channel_id=%d)", channel_id); + } + + Rendezvous::Args args; + // Rendezvous::Args owns the device context pointer. 
+ args.device_context = new XlaHostRecvDeviceContext( + stream, device_memory_base, shape, done_event); + + Tensor host_tensor; + TF_RETURN_IF_ERROR( + ctx->rendezvous()->Send(parsed_key, args, host_tensor, false)); + + return std::move(done_event); + }; +} + +// Provide RecvDeviceMemoryFunction for XLA host callbacks. This callback +// handles transferring from host to device. +xla::RecvDeviceMemoryFunction GetRecvDeviceMemoryFunction( + OpKernelContext* ctx) { + return + [ctx](int64_t channel_id, se::Stream* stream, const xla::Shape& shape, + se::DeviceMemoryBase* device_memory_base, + const absl::flat_hash_map& frontend_attrs) + -> StatusOr> { + auto iter = frontend_attrs.find("_xla_host_transfer_rendezvous"); + + // Generate the Rendezvous key. + const std::string& rendezvous_key_base = iter->second; + const std::string& dst_device = ctx->device()->name(); + + std::string task_prefix; + TF_RETURN_IF_ERROR(GetTaskName(dst_device, &task_prefix)); + const std::string src_device = + absl::StrCat(task_prefix, "/device:CPU:0"); + const std::string& rendezvous_key = + Rendezvous::CreateKey(src_device, /*src_incarnation=*/1, dst_device, + rendezvous_key_base, FrameAndIter(0, 0)); + VLOG(2) << "Rendezvous Key for sending from host: " << rendezvous_key; + + RendezvousInterface::ParsedKey parsed_key; + TF_RETURN_IF_ERROR(Rendezvous::ParseKey(rendezvous_key, &parsed_key)); + + tsl::AsyncValueRef done_event = + tsl::MakeConstructedAsyncValueRef(stream->parent()); + if (!done_event->Init()) { + return errors::Internal( + "Failed to initialize done event (channel_id=%d)", channel_id); + } + + Rendezvous::Args args; + // Rendezvous::Args owns the device context pointer. 
+ args.device_context = new XlaHostSendDeviceContext( + stream, device_memory_base, shape, done_event); + + Tensor device_tensor; + bool is_dead; + TF_RETURN_IF_ERROR(ctx->rendezvous()->Recv( + parsed_key, args, &device_tensor, /*is_dead=*/&is_dead)); + + return std::move(done_event); + }; +} + StatusOr RunExecutable( const XlaPlatformInfo& platform_info, const XlaComputationLaunchContext& launch_context, @@ -200,6 +322,15 @@ StatusOr RunExecutable( run_options.set_allocator(allocator); run_options.set_intra_op_thread_pool(&ctx->eigen_cpu_device()); run_options.set_rng_seed(GetXLARandomSeed()); + + // Host callbacks used for HLO send/recv. + xla::SendDeviceMemoryFunction send_function = + GetSendDeviceMemoryFunction(ctx); + run_options.set_send_device_memory_function(&send_function); + xla::RecvDeviceMemoryFunction recv_function = + GetRecvDeviceMemoryFunction(ctx); + run_options.set_recv_device_memory_function(&recv_function); + StatusOr execution_output; bool run_synchronous = !stream || platform_info.platform_id() == se::host::kHostPlatformId; @@ -263,7 +394,7 @@ Status CompileToLocalExecutable( // in the ResourceMgr. ResourceMgr* rm = ctx->resource_manager(); if (!rm) { - return errors::Internal("No resource manager."); + return absl::InternalError("No resource manager."); } XlaDeviceCompiler* xla_device_compiler; @@ -312,23 +443,13 @@ Status CompileToPjRtLoadedExecutable( // in the ResourceMgr. 
ResourceMgr* rm = ctx.resource_manager(); if (!rm) { - return errors::Internal("No resource manager."); + return absl::InternalError("No resource manager."); } PjRtDeviceCompiler* pjrt_device_compiler; - TF_RETURN_IF_ERROR(rm->LookupOrCreate( - rm->default_container(), "pjrt_device_compiler", &pjrt_device_compiler, - [&](PjRtDeviceCompiler** pjrt_device_compiler) { - return BuildPjRtDeviceCompiler(platform_info, ctx.function_library(), - pjrt_device_compiler); - })); DeviceCompilationProfiler* profiler; - TF_RETURN_IF_ERROR(rm->LookupOrCreate( - rm->default_container(), "pjrt_device_compilation_profiler", &profiler, - [](DeviceCompilationProfiler** profiler) { - *profiler = new DeviceCompilationProfiler(); - return OkStatus(); - })); + TF_RETURN_IF_ERROR(GetOrCreatePjRtDeviceCompilerAndProfiler( + platform_info, ctx.function_library(), &pjrt_device_compiler, &profiler)); // Hold the reference to the PJRT device compiler and profiler during // evaluation. (We could probably free them sooner because the ResourceMgr // will retain references, but this is more obviously correct.) 
@@ -337,8 +458,9 @@ Status CompileToPjRtLoadedExecutable( *client = pjrt_device_compiler->client(); - XlaCompiler::Options options = GenerateCompilerOptionsForPjRt( - *ctx.function_library(), ctx.device(), platform_info); + XlaCompiler::Options options = + GenerateCompilerOptionsForPjRt(*ctx.function_library(), ctx.device(), + platform_info, pjrt_device_compiler); XlaCompiler::CompileOptions compile_options = GenerateCompileOptions(has_ref_vars, may_alias_resource_update); @@ -474,19 +596,23 @@ void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { compilation_result, done, inputs, resources = resources_]() { auto platform_info = XlaPlatformInfoFromDevice(ctx->device()); - std::vector variable_infos; - OP_REQUIRES_OK_ASYNC( - ctx, - GetUpdatedVariables(ctx, inputs, resources, *compilation_result, - &variable_infos), - done); - OP_REQUIRES_OK_ASYNC(ctx, LockVariables(absl::MakeSpan(variable_infos)), - done); - OP_REQUIRES_OK_ASYNC( - ctx, - RunPjRtExecutable(*pjrt_client, inputs, variable_infos, - *compilation_result, pjrt_executable, ctx), - done); + // Separate scope so that VariableInfo locks are released before done() is + // called. + { + std::vector variable_infos; + OP_REQUIRES_OK_ASYNC( + ctx, + GetUpdatedVariables(ctx, inputs, resources, *compilation_result, + &variable_infos), + done); + OP_REQUIRES_OK_ASYNC(ctx, LockVariables(absl::MakeSpan(variable_infos)), + done); + OP_REQUIRES_OK_ASYNC( + ctx, + RunPjRtExecutable(*pjrt_client, inputs, variable_infos, + *compilation_result, pjrt_executable, ctx), + done); + } VLOG(2) << "Done executing with PJRT."; done(); }; @@ -505,65 +631,69 @@ void XlaLocalLaunchBase::ComputeAsync(OpKernelContext* ctx, DoneCallback done) { // Continuation of the execution, may be run in a different thread. 
auto run_xla_cluster = [ctx, client, executable, compilation_result, done, inputs, resources = resources_]() { - auto platform_info = XlaPlatformInfoFromDevice(ctx->device()); - std::vector variable_infos; - OP_REQUIRES_OK_ASYNC( - ctx, - GetUpdatedVariables(ctx, inputs, resources, *compilation_result, - &variable_infos), - done); - OP_REQUIRES_OK_ASYNC(ctx, LockVariables(absl::MakeSpan(variable_infos)), - done); - std::map resource_var_ptrs; - for (int i = 0; i < resources.size(); i++) { - resource_var_ptrs[resources[i]] = variable_infos[i].var()->tensor(); - } - - std::shared_ptr allocator = - GetAllocator(ctx->device(), GetStream(ctx), platform_info); - XlaComputationLaunchContext launch_context = - GetLaunchContext(platform_info, ctx, client, allocator.get()); - - const xla::HloInputOutputAliasConfig& input_output_alias = - executable->executable()->module().input_output_alias_config(); - StatusOr> execution_inputs = - launch_context.PopulateInputs( - ctx, compilation_result, resource_var_ptrs, - /*missing_ctx_input_prefix=*/0, input_output_alias); - OP_REQUIRES_OK_ASYNC(ctx, execution_inputs.status(), done); - - xla::gpu::GpuExecutableRunOptions gpu_options; - xla::DeviceAssignment device_assignment; - xla::ExecutableRunOptions run_options; - if (compilation_result->collective_info.has_value()) { + // Separate scope so that VariableInfo locks are released before done is + // called. 
+ { + auto platform_info = XlaPlatformInfoFromDevice(ctx->device()); + std::vector variable_infos; OP_REQUIRES_OK_ASYNC( ctx, - ResolveDeviceAssignment(ctx, *compilation_result->collective_info, - run_options, device_assignment, gpu_options), + GetUpdatedVariables(ctx, inputs, resources, *compilation_result, + &variable_infos), done); + OP_REQUIRES_OK_ASYNC(ctx, LockVariables(absl::MakeSpan(variable_infos)), + done); + std::map resource_var_ptrs; + for (int i = 0; i < resources.size(); i++) { + resource_var_ptrs[resources[i]] = variable_infos[i].var()->tensor(); + } + + std::shared_ptr allocator = + GetAllocator(ctx->device(), GetStream(ctx), platform_info); + XlaComputationLaunchContext launch_context = + GetLaunchContext(platform_info, ctx, client, allocator.get()); + + const xla::HloInputOutputAliasConfig& input_output_alias = + executable->executable()->module().input_output_alias_config(); + StatusOr> execution_inputs = + launch_context.PopulateInputs( + ctx, compilation_result, resource_var_ptrs, + /*missing_ctx_input_prefix=*/0, input_output_alias); + OP_REQUIRES_OK_ASYNC(ctx, execution_inputs.status(), done); + + xla::gpu::GpuExecutableRunOptions gpu_options; + xla::DeviceAssignment device_assignment; + xla::ExecutableRunOptions run_options; + if (compilation_result->collective_info.has_value()) { + OP_REQUIRES_OK_ASYNC(ctx, + ResolveDeviceAssignment( + ctx, *compilation_result->collective_info, + run_options, device_assignment, gpu_options), + done); + } + + // Hardcode run id to always be zero: TF distributed strategy + // differentiates between subsequent runs using dependency edges. This + // is safe, as only TF dist-strat can produce distributed ops, and we + // can rely on TF dist-strat invariants. 
+ xla::RunId run_id(0); + run_options.set_run_id(run_id); + + StatusOr execution_output = RunExecutable( + platform_info, launch_context, std::move(*execution_inputs), + run_options, executable, ctx, allocator.get()); + OP_REQUIRES_ASYNC(ctx, execution_output.ok(), execution_output.status(), + done); + + OP_REQUIRES_OK_ASYNC( + ctx, + launch_context.PopulateOutputs( + ctx, compilation_result, execution_output->ConsumeResult(), + /*missing_ctx_input_prefix=*/0, absl::MakeSpan(variable_infos), + input_output_alias, resource_var_ptrs), + done); + VLOG(1) << "Done"; } - - // Hardcode run id to always be zero: TF distributed strategy - // differentiates between subsequent runs using dependency edges. This - // is safe, as only TF dist-strat can produce distributed ops, and we - // can rely on TF dist-strat invariants. - xla::RunId run_id(0); - run_options.set_run_id(run_id); - - StatusOr execution_output = RunExecutable( - platform_info, launch_context, std::move(*execution_inputs), - run_options, executable, ctx, allocator.get()); - OP_REQUIRES_ASYNC(ctx, execution_output.ok(), execution_output.status(), - done); - - OP_REQUIRES_OK_ASYNC( - ctx, - launch_context.PopulateOutputs( - ctx, compilation_result, execution_output->ConsumeResult(), - /*missing_ctx_input_prefix=*/0, absl::MakeSpan(variable_infos), - input_output_alias, resource_var_ptrs), - done); - VLOG(1) << "Done"; done(); }; @@ -658,9 +788,11 @@ XlaCompileOp::XlaCompileOp(OpKernelConstruction* ctx) void XlaCompileOp::Compute(OpKernelContext* ctx) { VLOG(3) << "XlaCompileOp " << def().name() << (must_compile_ ? 
"(must-compile)" : ""); - xla::LocalClient* client; - const XlaCompiler::CompilationResult* kernel; - xla::LocalExecutable* executable; + const XlaCompiler::CompilationResult* kernel = nullptr; + xla::LocalClient* client = nullptr; + xla::LocalExecutable* executable = nullptr; + xla::PjRtClient* pjrt_client = nullptr; + xla::PjRtLoadedExecutable* pjrt_executable = nullptr; ResourceVarsSnapshot variables_snapshot; std::vector inputs = InputsFromContext(ctx); @@ -678,6 +810,11 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { : DeviceCompileMode::kLazy; }(); + bool use_pjrt = + GetXlaOpsCommonFlags() + ->tf_xla_use_device_api.IsEnabledInXlaCompileAndRunForDevice( + platform_info_.device_type()); + if (GetXlaOpsCommonFlags()->tf_xla_always_defer_compilation || cannot_compile_cluster) { executable = nullptr; @@ -691,22 +828,33 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { // Do not alias resource updates as locking variables in XlaCompile and // unlocking them in XlaRun may lead to deadlocks. - const Status status = CompileToLocalExecutable( - ctx, function_, has_ref_vars_, platform_info_, args, compile_mode, - /*may_alias_resource_update=*/false, &client, &kernel, &executable); + Status status; + if (use_pjrt) { + VLOG(2) << "Using PJRT for compilation. Function name: " + << function_.name(); + status = CompileToPjRtLoadedExecutable( + *ctx, platform_info_, function_, args, compile_mode, has_ref_vars_, + /*may_alias_resource_update=*/false, &kernel, &pjrt_client, + &pjrt_executable); + } else { + status = CompileToLocalExecutable( + ctx, function_, has_ref_vars_, platform_info_, args, compile_mode, + /*may_alias_resource_update=*/false, &client, &kernel, &executable); + } if (compile_mode != DeviceCompileMode::kLazy || status.code() != error::UNIMPLEMENTED) { OP_REQUIRES_OK(ctx, status); } if (status.code() == error::UNIMPLEMENTED) { - LOG(WARNING) << "Compilation failed:" << status.ToString() + LOG(WARNING) << "Compilation failed:" << status << ". 
Falling back to TF function call."; BroadcastOptimizationRemark( XlaOptimizationRemark::UNIMPLEMENTED_OPERATION, status.ToString()) .IgnoreError(); executable = nullptr; + pjrt_executable = nullptr; mutex_lock guard(cannot_compile_cluster_mu_); cannot_compile_cluster_ = true; } @@ -718,28 +866,36 @@ void XlaCompileOp::Compute(OpKernelContext* ctx) { Allocator* cpu_allocator = ctx->device()->GetAllocator(host_alloc_attrs); // Async compilation returns nullptr executable without an error. - if (!executable) { + if (!executable && !pjrt_executable) { DCHECK(!must_compile_); Tensor compilation_key(cpu_allocator, DT_STRING, TensorShape({})); - Tensor compilation_successful(cpu_allocator, DT_BOOL, TensorShape({})); compilation_successful.scalar()() = false; - ctx->set_output(0, Tensor(cpu_allocator, DT_STRING, TensorShape({}))); + ctx->set_output(0, compilation_key); ctx->set_output(1, compilation_successful); return; } - // Each execution of an XlaCompile op creates a new XlaExecutableClosure, even + // Each execution of an XlaCompile op creates a new ExecutableClosure, even // if it didn't have to compile the cluster because of a compilation-cache // hit. This is because we at least need new snapshots of the resource // variables. - XlaExecutableClosureStore::KeyT key = - XlaExecutableClosureStore::Global()->Produce(XlaExecutableClosure( - client, executable, kernel, std::move(variables_snapshot), - constants_.size())); - Tensor compilation_key(cpu_allocator, DT_STRING, TensorShape({})); - compilation_key.flat()(0) = key; + if (use_pjrt) { + PjRtExecutableClosureStore::KeyT key = + PjRtExecutableClosureStore::Global()->Produce(PjRtExecutableClosure( + pjrt_client, pjrt_executable, kernel, std::move(variables_snapshot), + constants_.size())); + compilation_key.flat()(0) = key; + VLOG(2) << "Compiled with PJRT. 
compilation_key: " << key; + } else { + XlaExecutableClosureStore::KeyT key = + XlaExecutableClosureStore::Global()->Produce(XlaExecutableClosure( + client, executable, kernel, std::move(variables_snapshot), + constants_.size())); + compilation_key.flat()(0) = key; + VLOG(2) << "Compiled with XLA. compilation_key: " << key; + } Tensor compilation_successful(cpu_allocator, DT_BOOL, TensorShape({})); compilation_successful.flat()(0) = true; diff --git a/tensorflow/compiler/jit/ops/BUILD b/tensorflow/compiler/jit/ops/BUILD index e70b5c2525d..1059a263d57 100644 --- a/tensorflow/compiler/jit/ops/BUILD +++ b/tensorflow/compiler/jit/ops/BUILD @@ -24,5 +24,5 @@ py_library( name = "xla_ops_grad", srcs = ["xla_ops_grad.py"], srcs_version = "PY3", - deps = ["//tensorflow/python:framework_ops"], + deps = ["//tensorflow/python/framework:ops"], ) diff --git a/tensorflow/compiler/jit/tests/BUILD b/tensorflow/compiler/jit/tests/BUILD index 17e47dd9a81..c74ea677fcd 100644 --- a/tensorflow/compiler/jit/tests/BUILD +++ b/tensorflow/compiler/jit/tests/BUILD @@ -77,8 +77,8 @@ cc_library( "//tensorflow/core:lib_internal", "//tensorflow/core:ops", "//tensorflow/core:test", - "//tensorflow/core:test_main", "//tensorflow/core/platform:path", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", ], ) diff --git a/tensorflow/compiler/jit/tests/device_compiler_test_helper.h b/tensorflow/compiler/jit/tests/device_compiler_test_helper.h index edb6be6a0ff..e8ae70928d1 100644 --- a/tensorflow/compiler/jit/tests/device_compiler_test_helper.h +++ b/tensorflow/compiler/jit/tests/device_compiler_test_helper.h @@ -19,6 +19,7 @@ limitations under the License. 
#include #include +#include "absl/status/status.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/jit/xla_activity_listener.h" #include "tensorflow/core/graph/graph_def_builder.h" @@ -51,7 +52,7 @@ class JitCompilationListener : public XlaActivityListener { bool expect_persistent_cache_use) { for (const auto& activity : activity_history_) { if (activity.used_persistent_cache() != expect_persistent_cache_use) { - return errors::FailedPrecondition("Unexpected listener history."); + return absl::FailedPreconditionError("Unexpected listener history."); } } return OkStatus(); diff --git a/tensorflow/compiler/jit/tf_graph_to_hlo_compiler.cc b/tensorflow/compiler/jit/tf_graph_to_hlo_compiler.cc index 3f96a0f2aa9..052ed6b6f38 100644 --- a/tensorflow/compiler/jit/tf_graph_to_hlo_compiler.cc +++ b/tensorflow/compiler/jit/tf_graph_to_hlo_compiler.cc @@ -23,14 +23,15 @@ Status TfGraphToHloCompiler::Compile(const XlaCompiler::CompileOptions& options, const NameAttrList& function, absl::Span args, XlaCompilationResult* result) { - return xla_compiler_.CompileFunction(options, function, args, result); + return ADD_SOURCE_LOCATION( + xla_compiler_.CompileFunction(options, function, args, result)); } Status TfGraphToHloCompiler::CompileSingleOp( const XlaCompiler::CompileOptions& options, const OpKernelContext* ctx, absl::Span args, XlaCompilationResult* result) { - return xla_compiler_.CompileSingleOp( - options, XlaCompiler::SingleOpCompileArgument(*ctx), args, result); + return ADD_SOURCE_LOCATION(xla_compiler_.CompileSingleOp( + options, XlaCompiler::SingleOpCompileArgument(*ctx), args, result)); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc index f6bdaf4e0bc..010ce8bd7c2 100644 --- a/tensorflow/compiler/jit/xla_compile_on_demand_op.cc +++ b/tensorflow/compiler/jit/xla_compile_on_demand_op.cc @@ -19,6 +19,7 @@ limitations under the License. 
#include #include +#include #include #include @@ -43,6 +44,7 @@ limitations under the License. #include "tensorflow/compiler/xla/executable_run_options.h" #include "tensorflow/compiler/xla/hlo/ir/hlo_input_output_alias_config.h" #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/compiler/xla/pjrt/tf_pjrt_client.h" #include "tensorflow/compiler/xla/service/executable.h" #include "tensorflow/compiler/xla/service/gpu/gpu_executable_run_options.h" #include "tensorflow/core/framework/function.h" @@ -53,6 +55,7 @@ limitations under the License. #include "tensorflow/core/lib/core/refcount.h" #include "tensorflow/core/lib/core/status.h" #include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tfrt/common/pjrt_util.h" #include "tensorflow/tsl/platform/errors.h" namespace tensorflow { @@ -169,28 +172,12 @@ Status XlaCompileOnDemandOp::Compile( DeviceCompilationProfiler** profiler, const XlaCompiler::CompilationResult** result, xla::PjRtLoadedExecutable** executable) { - // We store information about the JIT-compiled XLA computation - // in the ResourceMgr. 
- ResourceMgr* rm = ctx->resource_manager(); - if (!rm) { - return errors::Internal("No resource manager."); - } + TF_RETURN_IF_ERROR(GetOrCreatePjRtDeviceCompilerAndProfiler( + platform_info_, ctx->function_library(), pjrt_device_compiler, profiler)); - TF_RETURN_IF_ERROR(rm->LookupOrCreate( - rm->default_container(), "pjrt_device_compiler", pjrt_device_compiler, - [&](PjRtDeviceCompiler** pjrt_device_compiler) { - return BuildPjRtDeviceCompiler(platform_info_, ctx->function_library(), - pjrt_device_compiler); - })); - TF_RETURN_IF_ERROR(rm->LookupOrCreate( - rm->default_container(), "pjrt_device_compilation_profiler", profiler, - [](DeviceCompilationProfiler** profiler) { - *profiler = new DeviceCompilationProfiler(); - return OkStatus(); - })); - - XlaCompiler::Options options = GenerateCompilerOptionsForPjRt( - *(ctx->function_library()), ctx->device(), platform_info_); + XlaCompiler::Options options = + GenerateCompilerOptionsForPjRt(*(ctx->function_library()), ctx->device(), + platform_info_, *pjrt_device_compiler); // No detailed logging for on demand op. options.detailed_logging = false; XlaCompiler::CompileOptions compile_options = GetCompileOptions(true); diff --git a/tensorflow/compiler/jit/xla_compile_util.cc b/tensorflow/compiler/jit/xla_compile_util.cc index e5256a8b2c9..6a3e43f4a94 100644 --- a/tensorflow/compiler/jit/xla_compile_util.cc +++ b/tensorflow/compiler/jit/xla_compile_util.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_compile_util.h" #include +#include #include #include "tensorflow/compiler/jit/flags.h" @@ -24,6 +25,11 @@ limitations under the License. 
#include "tensorflow/core/util/determinism.h" namespace tensorflow { +namespace { +constexpr const char* kPjRtDeviceCompilerResourceName = "pjrt_device_compiler"; +constexpr const char* kPjRtDeviceCompilationProfilerResourceName = + "pjrt_device_compilation_profiler"; +} // namespace StatusOr> CreateSingleOpGraph( const NodeDef& node_def, absl::Span args, @@ -69,7 +75,18 @@ StatusOr> CreateSingleOpGraph( bool UsePjRtForSingleDeviceCompilation(const DeviceType& device_type) { const auto& rollout_config = GetXlaOpsCommonFlags()->tf_xla_use_device_api; return rollout_config.IsEnabledInXlaLaunchForDevice(device_type) || - rollout_config.IsEnabledInXlaCompileOnDemandForDevice(device_type); + rollout_config.IsEnabledInXlaCompileOnDemandForDevice(device_type) || + rollout_config.IsEnabledInXlaCompileAndRunForDevice(device_type); } +std::string GetPjRtDeviceCompilerResourceName(const DeviceType& device_type) { + return absl::StrCat(kPjRtDeviceCompilerResourceName, "_", + device_type.type_string()); +} + +std::string GetPjRtDeviceCompilationProfilerResourceName( + const DeviceType& device_type) { + return absl::StrCat(kPjRtDeviceCompilationProfilerResourceName, "_", + device_type.type_string()); +} } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compile_util.h b/tensorflow/compiler/jit/xla_compile_util.h index 345c55a86e5..d555738d4c3 100644 --- a/tensorflow/compiler/jit/xla_compile_util.h +++ b/tensorflow/compiler/jit/xla_compile_util.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_JIT_XLA_COMPILE_UTIL_H_ #include +#include #include "tensorflow/compiler/tf2xla/xla_argument.h" #include "tensorflow/core/graph/graph.h" @@ -47,6 +48,14 @@ StatusOr> CreateSingleOpGraph( // Checks if single device compilation and execution with PJRT is enabled for // `device_type` in either the XlaLaunch op or the XlaCompileOnDemand op. 
bool UsePjRtForSingleDeviceCompilation(const DeviceType& device_type); + +// Gets the resource name of the PjRt DeviceCompiler for `device_type`. +std::string GetPjRtDeviceCompilerResourceName(const DeviceType& device_type); + +// Gets the resource name of the DeviceCompilationProfiler for `device_type` +// when PjRt is used for compilation and execution. +std::string GetPjRtDeviceCompilationProfilerResourceName( + const DeviceType& device_type); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_JIT_XLA_COMPILE_UTIL_H_ diff --git a/tensorflow/compiler/jit/xla_compile_util_test.cc b/tensorflow/compiler/jit/xla_compile_util_test.cc index 9fc706fb649..7e55498ec42 100644 --- a/tensorflow/compiler/jit/xla_compile_util_test.cc +++ b/tensorflow/compiler/jit/xla_compile_util_test.cc @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/core/framework/fake_input.h" #include "tensorflow/core/kernels/ops_testutil.h" +#include "tensorflow/core/tpu/tpu_defs.h" namespace tensorflow { namespace { @@ -118,5 +119,31 @@ TEST(XlaCompileUtilTest, PjRtXlaCompileOnDemandFlagTest) { EXPECT_FALSE(UsePjRtForSingleDeviceCompilation(DeviceType(DEVICE_CPU))); } +TEST(XlaCompileUtilTest, PjRtDeviceCompilerResourceName) { + EXPECT_EQ(GetPjRtDeviceCompilerResourceName(DeviceType(DEVICE_TPU)), + "pjrt_device_compiler_TPU"); + EXPECT_EQ(GetPjRtDeviceCompilerResourceName(DeviceType(DEVICE_TPU_NODE)), + "pjrt_device_compiler_TPU"); + EXPECT_EQ(GetPjRtDeviceCompilerResourceName(DeviceType(DEVICE_CPU)), + "pjrt_device_compiler_CPU"); + EXPECT_EQ(GetPjRtDeviceCompilerResourceName(DeviceType(DEVICE_GPU)), + "pjrt_device_compiler_GPU"); +} + +TEST(XlaCompileUtilTest, PjRtDeviceCompilationProfilerResourceName) { + EXPECT_EQ( + GetPjRtDeviceCompilationProfilerResourceName(DeviceType(DEVICE_TPU)), + "pjrt_device_compilation_profiler_TPU"); + EXPECT_EQ( + GetPjRtDeviceCompilationProfilerResourceName(DeviceType(DEVICE_TPU_NODE)), + 
"pjrt_device_compilation_profiler_TPU"); + EXPECT_EQ( + GetPjRtDeviceCompilationProfilerResourceName(DeviceType(DEVICE_CPU)), + "pjrt_device_compilation_profiler_CPU"); + EXPECT_EQ( + GetPjRtDeviceCompilationProfilerResourceName(DeviceType(DEVICE_GPU)), + "pjrt_device_compilation_profiler_GPU"); +} + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compiler_options_util.cc b/tensorflow/compiler/jit/xla_compiler_options_util.cc index 8580bcfbeef..1ba962380d8 100644 --- a/tensorflow/compiler/jit/xla_compiler_options_util.cc +++ b/tensorflow/compiler/jit/xla_compiler_options_util.cc @@ -15,10 +15,14 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_compiler_options_util.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" + namespace tensorflow { namespace { using XlaDeviceCompiler = DeviceCompiler; +using PjRtDeviceCompiler = + DeviceCompiler; inline void LogOptions(const XlaCompiler::Options& options) { VLOG(2) << "XlaCompiler::Options[device_type=" << options.device_type @@ -81,7 +85,8 @@ XlaCompiler::Options GenerateCompilerOptionsForTfrtTpu( XlaCompiler::Options GenerateCompilerOptionsForPjRt( const FunctionLibraryRuntime& function_library, - const DeviceBase* device_base, const XlaPlatformInfo& platform_info) { + const DeviceBase* device_base, const XlaPlatformInfo& platform_info, + const PjRtDeviceCompiler* pjrt_device_compiler) { XlaCompiler::Options options; options.device_ordinal = device_base->parsed_name().id; options.flib_def = function_library.GetFunctionLibraryDefinition(); @@ -96,8 +101,9 @@ XlaCompiler::Options GenerateCompilerOptionsForPjRt( options.device_type = metadata->jit_device_type(); options.shape_determination_fns = metadata->default_shape_determination_fns(); + } else if (pjrt_device_compiler != nullptr) { + options.device_type = pjrt_device_compiler->device_type(); } - // TODO(b/255826209): Set options for non-XLA devices once PjRt supports them. 
// TODO(b/255826209): Confirm below options are correctly set after testing. options.allow_cpu_custom_calls = false; options.alias_passthrough_params = false; diff --git a/tensorflow/compiler/jit/xla_compiler_options_util.h b/tensorflow/compiler/jit/xla_compiler_options_util.h index 1be63a6dc8b..1c70b91c8f5 100644 --- a/tensorflow/compiler/jit/xla_compiler_options_util.h +++ b/tensorflow/compiler/jit/xla_compiler_options_util.h @@ -20,6 +20,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/xla/client/local_client.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" namespace tensorflow { @@ -39,9 +40,12 @@ XlaCompiler::Options GenerateCompilerOptionsForTfrtTpu( // Returns created options for XLA compiler when PjRt (Device API) is used for // compilation and execution. +// TODO(b/255826209): Remove default arg once PjRtCompileOnDemand op is deleted. XlaCompiler::Options GenerateCompilerOptionsForPjRt( const FunctionLibraryRuntime& function_library, - const DeviceBase* device_base, const XlaPlatformInfo& platform_info); + const DeviceBase* device_base, const XlaPlatformInfo& platform_info, + const DeviceCompiler* + pjrt_device_compiler = nullptr); } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_compiler_options_util_test.cc b/tensorflow/compiler/jit/xla_compiler_options_util_test.cc index 2a4742567e4..1ab03bc7444 100644 --- a/tensorflow/compiler/jit/xla_compiler_options_util_test.cc +++ b/tensorflow/compiler/jit/xla_compiler_options_util_test.cc @@ -23,12 +23,14 @@ limitations under the License. 
#include #include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/jit/pjrt_device_compiler_client.h" #include "tensorflow/compiler/jit/test_util.h" #include "tensorflow/compiler/jit/xla_platform_info.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/client_library.h" +#include "tensorflow/compiler/xla/pjrt/pjrt_client.h" #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/core/framework/device.h" #include "tensorflow/core/framework/device_base.h" @@ -42,18 +44,31 @@ using XlaDeviceCompiler = DeviceCompiler; using XlaDeviceExecutablePersistor = DeviceExecutablePersistor; +using PjRtDeviceCompiler = + DeviceCompiler; +using PjRtDeviceExecutablePersistor = + DeviceExecutablePersistor; -XlaDeviceCompiler* CreateXlaDeviceCompiler( - const XlaDeviceExecutablePersistor::Config& persistor_config, - DeviceType device_type, xla::LocalClient* local_client) { +XlaDeviceCompiler* CreateXlaDeviceCompiler(DeviceType device_type, + xla::LocalClient* local_client) { auto persistor = std::make_unique( - std::move(persistor_config), device_type); + XlaDeviceExecutablePersistor::Config(), device_type); auto compiler_client = std::make_unique(local_client); return new XlaDeviceCompiler(std::move(persistor), std::move(compiler_client)); } +PjRtDeviceCompiler* CreatePjRtDeviceCompiler(DeviceType device_type, + xla::PjRtClient* pjrt_client) { + auto persistor = std::make_unique( + PjRtDeviceExecutablePersistor::Config(), device_type); + auto compiler_client = + std::make_unique(pjrt_client); + return new PjRtDeviceCompiler(std::move(persistor), + std::move(compiler_client)); +} + std::vector GetShapeDeterminationFns() { XlaHelpers::ShapeRepresentationFn shape_representation_fn = @@ -160,6 +175,45 @@ TEST_F(XlaCompilerOptionsTest, PjRtOptionsPjRtBaseDevice) { 
tensorflow::XlaLayoutPreference::kTpuPreferLinearLayout); } +TEST_F(XlaCompilerOptionsTest, PjRtOptionsNonXlaDevice) { + device_setup_.AddDevicesAndSetUp({DEVICE_CPU}); + Device* device = device_setup_.GetDevice(DEVICE_CPU); + DeviceType compilation_device_type = DeviceType(DEVICE_CPU_XLA_JIT); + + XlaPlatformInfo platform_info(compilation_device_type, + /*platform_id=*/nullptr, + /*xla_device_metadata=*/nullptr, + /*pjrt_device_metadata=*/nullptr, + /*device_allocator=*/nullptr); + + auto pjrt_device_compiler = + CreatePjRtDeviceCompiler(compilation_device_type, nullptr); + core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); + + XlaCompiler::Options options = GenerateCompilerOptionsForPjRt( + *device_setup_.flr(), device, platform_info, pjrt_device_compiler); + + EXPECT_EQ(options.device_type, compilation_device_type); + EXPECT_EQ(options.device_ordinal, 0); + EXPECT_NE(options.flib_def, nullptr); + EXPECT_EQ(options.graph_def_version, TF_GRAPH_DEF_VERSION); + EXPECT_FALSE(options.allow_cpu_custom_calls); + EXPECT_FALSE(options.alias_passthrough_params); + EXPECT_FALSE(options.detailed_logging); + // Check whether options have default shape determination functions set. 
+ TF_ASSERT_OK_AND_ASSIGN( + auto shape, options.shape_determination_fns.shape_representation_fn( + TensorShape(), DT_FLOAT, false, + tensorflow::XlaLayoutPreference::kNoPreference)); + xla::ShapeProto shape_proto; + shape_proto.set_element_type(xla::PrimitiveType::F32); + shape_proto.mutable_layout(); + EXPECT_EQ(shape, xla::Shape(shape_proto)); + EXPECT_EQ(options.shape_determination_fns.layout_preference_fn( + TensorShape(), DT_FLOAT, std::nullopt), + tensorflow::XlaLayoutPreference::kNoPreference); +} + TEST_F(XlaCompilerOptionsTest, XlaOptions) { device_setup_.AddDevicesAndSetUp({DEVICE_XLA_GPU}); Device* device = device_setup_.GetDevice(DEVICE_XLA_GPU); @@ -168,8 +222,8 @@ TEST_F(XlaCompilerOptionsTest, XlaOptions) { DeviceType device_type = DeviceType(DEVICE_XLA_GPU); DeviceType compilation_device_type = DeviceType(DEVICE_GPU_XLA_JIT); - auto xla_device_compiler = CreateXlaDeviceCompiler( - XlaDeviceExecutablePersistor::Config(), compilation_device_type, client); + auto xla_device_compiler = + CreateXlaDeviceCompiler(compilation_device_type, client); core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); se::Platform::Id platform_id = se::host::kHostPlatformId; @@ -208,8 +262,8 @@ TEST_F(XlaCompilerOptionsTest, XlaOptionsHasRefVarsNoXlaDeviceMetadata) { DeviceType device_type = DeviceType(DEVICE_CPU); DeviceType compilation_device_type = DeviceType(DEVICE_CPU_XLA_JIT); - auto xla_device_compiler = CreateXlaDeviceCompiler( - XlaDeviceExecutablePersistor::Config(), compilation_device_type, client); + auto xla_device_compiler = + CreateXlaDeviceCompiler(compilation_device_type, client); core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); se::Platform::Id platform_id = se::host::kHostPlatformId; @@ -249,8 +303,8 @@ TEST_F(XlaCompilerOptionsTest, TfRtTpuOptions) { xla::LocalClient* client = xla::ClientLibrary::LocalClientOrDie(); DeviceType compilation_device_type = DeviceType(DEVICE_TPU_XLA_JIT); - auto xla_device_compiler = 
CreateXlaDeviceCompiler( - XlaDeviceExecutablePersistor::Config(), compilation_device_type, client); + auto xla_device_compiler = + CreateXlaDeviceCompiler(compilation_device_type, client); core::ScopedUnref xla_device_compiler_ref(xla_device_compiler); XlaCompiler::Options options = GenerateCompilerOptionsForTfrtTpu( diff --git a/tensorflow/compiler/jit/xla_host_recv_device_context.cc b/tensorflow/compiler/jit/xla_host_recv_device_context.cc new file mode 100644 index 00000000000..b634ac88739 --- /dev/null +++ b/tensorflow/compiler/jit/xla_host_recv_device_context.cc @@ -0,0 +1,49 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/jit/xla_host_recv_device_context.h" + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/type_util.h" + +namespace tensorflow { + +void XlaHostRecvDeviceContext::CopyDeviceTensorToCPU( + const Tensor* device_tensor, StringPiece tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) { + DataType dtype = EncodePrimitiveTypeAsDataType(shape_.element_type()).value(); + TensorShape tensor_shape; + Status status = XLAShapeToTensorShape(shape_, &tensor_shape); + if (!status.ok()) { + done(status); + return; + } + + *cpu_tensor = Tensor(dtype, tensor_shape); + + stream_->ThenMemcpy(cpu_tensor->data(), device_memory_base_, + device_memory_base_.size()); + stream_->ThenRecordEvent(&done_event_.get()); + if (auto st = stream_->BlockHostUntilDone(); !st.ok()) { + done_event_.SetError(absl::InternalError(absl::StrFormat( + "failed to synchronize send operation with a stream: %s", + st.ToString()))); + return; + } + + done_event_.SetStateConcrete(); + done(OkStatus()); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_host_recv_device_context.h b/tensorflow/compiler/jit/xla_host_recv_device_context.h new file mode 100644 index 00000000000..e2c5d1767d1 --- /dev/null +++ b/tensorflow/compiler/jit/xla_host_recv_device_context.h @@ -0,0 +1,92 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_HOST_RECV_DEVICE_CONTEXT_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_HOST_RECV_DEVICE_CONTEXT_H_ + +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/stream_executor/device_memory.h" +#include "tensorflow/compiler/xla/stream_executor/stream.h" +#include "tensorflow/core/framework/device_base.h" +#include "tfrt/concurrency/async_value_ref.h" // from @tf_runtime + +namespace tensorflow { + +// XlaHostRecvDeviceContext is a DeviceContext that is intended to be +// used to transfer from device->host using Rendezvous. It transfers the +// content of `device_memory_base` with `shape` using `stream`. Only +// `CopyDeviceTensorToCPU` method is implemented. The `done_event` is marked as +// Concrete once transfer is completed. +// +// Example usage: +// +// Device device; +// stream_executor::Stream stream(executor); +// Tensor device_tensor(device_allocator, DT_FLOAT, TensorShape({2, 2})); +// se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)}; +// xla::Shape shape(xla::F32, {2, 2}, {}, {}) +// tsl::AsyncValueRef done_event = +// tsl::MakeConstructedAsyncValueRef(stream.parent()); +// done_event->Init(); +// Tensor dest_cpu_tensor; +// +// XlaHostRecvDeviceContext device_context(&stream, gpu_dst, +// shape, done_event); +// device_context.CopyDeviceTensorToCPUSync( +// &device_tensor, "", &device, &dest_cpu_tensor); + +class XlaHostRecvDeviceContext : public DeviceContext { + public: + XlaHostRecvDeviceContext(se::Stream* stream, + const se::DeviceMemoryBase& device_memory_base, + const xla::Shape& shape, + tsl::AsyncValueRef& done_event) + : stream_(stream), + device_memory_base_(device_memory_base), + shape_(shape), + done_event_(done_event) {} + + void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* 
device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override { + done(errors::Internal("host->device copy not implemented.")); + } + + // Copies `device_memory_base_` with `shape_` into `cpu_tensor`. + // `device_tensor` is unused. + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + StringPiece tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override; + + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override { + done(errors::Internal("device->device copy not implemented.")); + } + + private: + se::Stream* stream_; // Not owned. + // This is copied rather than a reference or pointer since its lifetime + // is not guaranteed to outlast the original object. Object slicing is + // not an issue here since only DeviceMemoryBase methods/members are used. + const se::DeviceMemoryBase device_memory_base_; + const xla::Shape shape_; + tsl::AsyncValueRef done_event_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaHostRecvDeviceContext); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_HOST_RECV_DEVICE_CONTEXT_H_ diff --git a/tensorflow/compiler/jit/xla_host_send_device_context.cc b/tensorflow/compiler/jit/xla_host_send_device_context.cc new file mode 100644 index 00000000000..1c30ef022a8 --- /dev/null +++ b/tensorflow/compiler/jit/xla_host_send_device_context.cc @@ -0,0 +1,39 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/jit/xla_host_send_device_context.h" + +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/type_util.h" + +namespace tensorflow { + +void XlaHostSendDeviceContext::CopyCPUTensorToDevice( + const Tensor* cpu_tensor, Device* device, Tensor* device_tensor, + StatusCallback done, bool sync_dst_compute) const { + stream_->ThenMemcpy(device_memory_base_, cpu_tensor->data(), + device_memory_base_->size()); + stream_->ThenRecordEvent(&done_event_.get()); + if (auto st = stream_->BlockHostUntilDone(); !st.ok()) { + done_event_.SetError(absl::InternalError(absl::StrFormat( + "failed to synchronize send operation with a stream: %s", + st.ToString()))); + return; + } + + done_event_.SetStateConcrete(); + done(OkStatus()); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_host_send_device_context.h b/tensorflow/compiler/jit/xla_host_send_device_context.h new file mode 100644 index 00000000000..ce292fa61d1 --- /dev/null +++ b/tensorflow/compiler/jit/xla_host_send_device_context.h @@ -0,0 +1,89 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_JIT_XLA_HOST_SEND_DEVICE_CONTEXT_H_ +#define TENSORFLOW_COMPILER_JIT_XLA_HOST_SEND_DEVICE_CONTEXT_H_ + +#include "tensorflow/compiler/xla/shape.h" +#include "tensorflow/compiler/xla/stream_executor/device_memory.h" +#include "tensorflow/compiler/xla/stream_executor/stream.h" +#include "tensorflow/core/framework/device_base.h" +#include "tfrt/concurrency/async_value_ref.h" // from @tf_runtime + +namespace tensorflow { + +// XlaHostSendDeviceContext is a DeviceContext that is intended to be +// used to transfer from host->device using Rendezvous. It transfers the +// content of `device_memory_base` with `shape` using `stream`. Only +// `CopyCPUTensorToDevice` method is implemented. The `done_event` is marked as +// Concrete once transfer is completed. +// +// Example usage: +// +// Device device; +// stream_executor::Stream stream(executor); +// Tensor cpu_tensor(host_allocator, DT_FLOAT, TensorShape({2, 2})); +// Tensor device_tensor(device_allocator, DT_FLOAT, TensorShape({2, 2})); +// se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)}; +// xla::Shape shape(xla::F32, {2, 2}, {}, {}) +// tsl::AsyncValueRef done_event = +// tsl::MakeConstructedAsyncValueRef(stream.parent()); +// done_event->Init(); +// +// XlaHostSendDeviceContext device_context(&stream, &gpu_dst, +// shape, done_event); +// device_context.CopyCPUTensorToDeviceSync( +// &cpu_tensor, &device, &device_tensor); + +class XlaHostSendDeviceContext : public DeviceContext { + public: + XlaHostSendDeviceContext(se::Stream* stream, + se::DeviceMemoryBase* device_memory_base, + const xla::Shape& shape, + tsl::AsyncValueRef& done_event) + : stream_(stream), + device_memory_base_(device_memory_base), + shape_(shape), + done_event_(done_event) {} + + // Copies 'cpu_tensor' to `device_memory_base_` with `shape_`. + // `device_tensor` is unused. 
+ void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, + Tensor* device_tensor, StatusCallback done, + bool sync_dst_compute) const override; + + void CopyDeviceTensorToCPU(const Tensor* device_tensor, + StringPiece tensor_name, Device* device, + Tensor* cpu_tensor, StatusCallback done) override { + done(errors::Internal("host->device copy not implemented.")); + } + + void CopyTensorInSameDevice(const Tensor* input_tensor, Device* device, + Tensor* output_tensor, + StatusCallback done) const override { + done(errors::Internal("device->device copy not implemented.")); + } + + private: + se::Stream* stream_; // Not owned. + se::DeviceMemoryBase* device_memory_base_; // Not owned. + const xla::Shape shape_; + tsl::AsyncValueRef done_event_; + + TF_DISALLOW_COPY_AND_ASSIGN(XlaHostSendDeviceContext); +}; + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_JIT_XLA_HOST_SEND_DEVICE_CONTEXT_H_ diff --git a/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc b/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc new file mode 100644 index 00000000000..90d7b3b7b8f --- /dev/null +++ b/tensorflow/compiler/jit/xla_host_send_recv_device_context_test.cc @@ -0,0 +1,171 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include +#include +#include + +#include +#include "tensorflow/compiler/jit/xla_host_recv_device_context.h" +#include "tensorflow/compiler/jit/xla_host_send_device_context.h" +#include "tensorflow/compiler/tf2xla/shape_util.h" +#include "tensorflow/compiler/tf2xla/xla_op_registry.h" +#include "tensorflow/compiler/xla/stream_executor/device_memory.h" +#include "tensorflow/compiler/xla/stream_executor/multi_platform_manager.h" +#include "tensorflow/compiler/xla/stream_executor/stream.h" +#include "tensorflow/compiler/xla/stream_executor/stream_executor.h" +#include "tensorflow/core/framework/tensor_testutil.h" +#include "tensorflow/tsl/lib/core/status_test_util.h" + +namespace tensorflow { +namespace { + +class XlaHostSendRecvDeviceContextTest : public ::testing::Test { + public: + void SetDevice(const string& device_type) { + auto device_factory = DeviceFactory::GetFactory(device_type); + SessionOptions options; + std::vector> devices; + Status s = device_factory->CreateDevices( + options, "/job:worker/replica:0/task:0", &devices); + device_ = std::move(devices[0]); + + AllocatorAttributes host_alloc_attr; + host_alloc_attr.set_on_host(true); + host_allocator_ = device_->GetAllocator(host_alloc_attr); + + AllocatorAttributes device_alloc_attr; + device_alloc_attr.set_on_host(false); + device_allocator_ = device_->GetAllocator(device_alloc_attr); + } + + protected: + std::unique_ptr device_; + Allocator* host_allocator_; + Allocator* device_allocator_; +}; + +TEST_F(XlaHostSendRecvDeviceContextTest, CopyDeviceTensorToCPU) { + SetDevice("GPU"); + Tensor origin_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); + test::FillValues(&origin_cpu_tensor, {1.2, 2.3, 3.4, 4.5}); + Tensor device_tensor(device_allocator_, DT_FLOAT, TensorShape({2, 2})); + Tensor dest_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); + + stream_executor::Platform* platform = + 
stream_executor::MultiPlatformManager::PlatformWithName("CUDA").value(); + stream_executor::StreamExecutor* executor = + platform->ExecutorForDevice(0).value(); + stream_executor::Stream stream(executor); + stream.Init(); + ASSERT_TRUE(stream.ok()); + + se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)}; + xla::Shape shape; + TF_ASSERT_OK(TensorShapeToXLAShape(DT_FLOAT, TensorShape({2, 2}), &shape)); + + // Copy the cpu_tensor to the GPU first before trying to copy it back. + stream.ThenMemcpy(&gpu_dst, origin_cpu_tensor.data(), gpu_dst.size()); + TF_ASSERT_OK(stream.BlockHostUntilDone()); + + tsl::AsyncValueRef done_event = + tsl::MakeConstructedAsyncValueRef(stream.parent()); + done_event->Init(); + XlaHostRecvDeviceContext* device_context = + new XlaHostRecvDeviceContext(&stream, gpu_dst, shape, done_event); + TF_ASSERT_OK(device_context->CopyDeviceTensorToCPUSync( + &device_tensor, "", device_.get(), &dest_cpu_tensor)); + + tensorflow::test::ExpectClose(origin_cpu_tensor, dest_cpu_tensor); + device_context->Unref(); +} + +TEST_F(XlaHostSendRecvDeviceContextTest, CopyCPUTensorToDevice) { + SetDevice("GPU"); + Tensor origin_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); + test::FillValues(&origin_cpu_tensor, {1.2, 2.3, 3.4, 4.5}); + Tensor device_tensor(device_allocator_, DT_FLOAT, TensorShape({2, 2})); + Tensor dest_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); + + stream_executor::Platform* platform = + stream_executor::MultiPlatformManager::PlatformWithName("CUDA").value(); + stream_executor::StreamExecutor* executor = + platform->ExecutorForDevice(0).value(); + stream_executor::Stream stream(executor); + stream.Init(); + ASSERT_TRUE(stream.ok()); + + se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)}; + xla::Shape shape; + TF_ASSERT_OK(TensorShapeToXLAShape(DT_FLOAT, TensorShape({2, 2}), &shape)); + + tsl::AsyncValueRef done_event = + tsl::MakeConstructedAsyncValueRef(stream.parent()); + 
done_event->Init(); + XlaHostSendDeviceContext* device_context = + new XlaHostSendDeviceContext(&stream, &gpu_dst, shape, done_event); + TF_ASSERT_OK(device_context->CopyCPUTensorToDeviceSync( + &origin_cpu_tensor, device_.get(), &device_tensor)); + + // Copy the GPU tensor back to CPU to check that copy worked. + stream.ThenMemcpy(dest_cpu_tensor.data(), gpu_dst, gpu_dst.size()); + TF_ASSERT_OK(stream.BlockHostUntilDone()); + + tensorflow::test::ExpectClose(origin_cpu_tensor, dest_cpu_tensor); + device_context->Unref(); +} + +TEST_F(XlaHostSendRecvDeviceContextTest, RoundTrip) { + SetDevice("GPU"); + Tensor origin_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); + test::FillValues(&origin_cpu_tensor, {1.2, 2.3, 3.4, 4.5}); + Tensor device_tensor(device_allocator_, DT_FLOAT, TensorShape({2, 2})); + Tensor dest_cpu_tensor(host_allocator_, DT_FLOAT, TensorShape({2, 2})); + + stream_executor::Platform* platform = + stream_executor::MultiPlatformManager::PlatformWithName("CUDA").value(); + stream_executor::StreamExecutor* executor = + platform->ExecutorForDevice(0).value(); + stream_executor::Stream stream(executor); + stream.Init(); + ASSERT_TRUE(stream.ok()); + + se::DeviceMemoryBase gpu_dst{device_tensor.data(), 4 * sizeof(float)}; + xla::Shape shape; + TF_ASSERT_OK(TensorShapeToXLAShape(DT_FLOAT, TensorShape({2, 2}), &shape)); + + tsl::AsyncValueRef send_done_event = + tsl::MakeConstructedAsyncValueRef(stream.parent()); + send_done_event->Init(); + XlaHostSendDeviceContext* send_device_context = + new XlaHostSendDeviceContext(&stream, &gpu_dst, shape, send_done_event); + TF_ASSERT_OK(send_device_context->CopyCPUTensorToDeviceSync( + &origin_cpu_tensor, device_.get(), &device_tensor)); + + tsl::AsyncValueRef recv_done_event = + tsl::MakeConstructedAsyncValueRef(stream.parent()); + recv_done_event->Init(); + XlaHostRecvDeviceContext* recv_device_context = + new XlaHostRecvDeviceContext(&stream, gpu_dst, shape, recv_done_event); + 
TF_ASSERT_OK(recv_device_context->CopyDeviceTensorToCPUSync( + &device_tensor, "", device_.get(), &dest_cpu_tensor)); + + tensorflow::test::ExpectClose(origin_cpu_tensor, dest_cpu_tensor); + send_device_context->Unref(); + recv_device_context->Unref(); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_kernel_creator_test.cc b/tensorflow/compiler/jit/xla_kernel_creator_test.cc index e683caa1aac..b66e4270d3a 100644 --- a/tensorflow/compiler/jit/xla_kernel_creator_test.cc +++ b/tensorflow/compiler/jit/xla_kernel_creator_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/jit/xla_kernel_creator.h" #include "absl/memory/memory.h" +#include "absl/status/status.h" #include "tensorflow/core/common_runtime/device_factory.h" #include "tensorflow/core/common_runtime/function.h" #include "tensorflow/core/framework/function_testlib.h" @@ -136,7 +137,7 @@ TEST_F(XlaKernelCreatorTest, FailsIfXlaCompileAttrNotSet) { input: 'b' )proto"), &kernel_); - EXPECT_TRUE(errors::IsInternal(status)) << status.ToString(); + EXPECT_TRUE(absl::IsInternal(status)) << status; } TEST_F(XlaKernelCreatorTest, FailsIfXlaCompileAttrIsSetToFalse) { @@ -153,7 +154,7 @@ TEST_F(XlaKernelCreatorTest, FailsIfXlaCompileAttrIsSetToFalse) { input: 'b' )proto"), &kernel_); - EXPECT_TRUE(errors::IsInternal(status)) << status.ToString(); + EXPECT_TRUE(absl::IsInternal(status)) << status; } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/xla_platform_info.cc b/tensorflow/compiler/jit/xla_platform_info.cc index b311faa13ab..6d6c5e5b492 100644 --- a/tensorflow/compiler/jit/xla_platform_info.cc +++ b/tensorflow/compiler/jit/xla_platform_info.cc @@ -18,17 +18,21 @@ limitations under the License. 
#include #include #include +#include #include #include #include "tensorflow/compiler/jit/device_executable_persistor.h" #include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/jit/pjrt_device_compiler_client.h" +#include "tensorflow/compiler/jit/xla_compile_util.h" #include "tensorflow/compiler/jit/xla_device_compiler_client.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/local_client.h" #include "tensorflow/compiler/xla/pjrt/pjrt_client.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/tfrt/common/create_pjrt_client_util.h" +#include "tensorflow/core/tfrt/common/global_state.h" #include "tensorflow/core/tfrt/common/pjrt_util.h" #include "tensorflow/core/tpu/tpu_defs.h" @@ -45,19 +49,23 @@ using PjRtDeviceExecutablePersistor = XlaDeviceCompiler* CreateXlaDeviceCompiler( const XlaDeviceExecutablePersistor::Config& persistor_config, - DeviceType device_type, xla::LocalClient* local_client) { + DeviceType compilation_device_type, xla::LocalClient* local_client) { return new XlaDeviceCompiler( std::make_unique( - std::move(persistor_config), device_type), + std::move(persistor_config), compilation_device_type), std::make_unique(local_client)); } -PjRtDeviceCompiler* CreatePjRtDeviceCompiler( - const PjRtDeviceExecutablePersistor::Config& persistor_config, - DeviceType device_type, xla::PjRtClient* pjrt_client) { +PjRtDeviceCompiler* CreatePjRtDeviceCompiler(DeviceType compilation_device_type, + xla::PjRtClient* pjrt_client) { + PjRtDeviceExecutablePersistor::Config persistor_config( + GetMarkForCompilationPassFlags()->tf_xla_persistent_cache_directory, + GetMarkForCompilationPassFlags()->tf_xla_disable_strict_signature_checks, + GetMarkForCompilationPassFlags()->tf_xla_persistent_cache_prefix); + return new PjRtDeviceCompiler( std::make_unique( - std::move(persistor_config), device_type), + std::move(persistor_config), compilation_device_type), 
std::make_unique(pjrt_client)); } @@ -73,6 +81,60 @@ StatusOr>> GetAllowedGpus( return gpu_ids; } + +Status GetCompilationDeviceTypeAndPjRtClient( + const XlaPlatformInfo& platform_info, FunctionLibraryRuntime* flr, + DeviceType* compilation_device_type, xla::PjRtClient** pjrt_client) { + DeviceType device_type = platform_info.device_type(); + + if (platform_info.xla_device_metadata()) { + VLOG(2) << "Building PjRtDeviceCompiler using " + "platform_info.xla_device_metadata()."; + + *compilation_device_type = + platform_info.xla_device_metadata()->jit_device_type(); + TF_ASSIGN_OR_RETURN(*pjrt_client, GetOrCreatePjRtClient(device_type)); + return OkStatus(); + } + + if (platform_info.pjrt_device_metadata()) { + VLOG(2) << "Building PjRtDeviceCompiler using " + "platform_info.pjrt_device_metadata()."; + + *compilation_device_type = + platform_info.pjrt_device_metadata()->jit_device_type(); + TF_ASSIGN_OR_RETURN(*pjrt_client, GetOrCreatePjRtClient(device_type)); + return OkStatus(); + } + + // TFRT-TPU is used if device_type is `DEVICE_TPU` and platform_info does not + // have `xla_device_metadata`. + if (device_type == DEVICE_TPU) { + *compilation_device_type = DeviceType(DEVICE_TPU_XLA_JIT); + TF_ASSIGN_OR_RETURN(*pjrt_client, GetOrCreatePjRtClient(device_type)); + return OkStatus(); + } + + VLOG(2) << "platform_info.xla_device_metadata not found and " + "platform_info.device_type() != DEVICE_TPU. Building " + "PjRtDeviceCompiler for non-XLA device."; + + const XlaOpRegistry::DeviceRegistration* registration; + if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration)) { + return errors::InvalidArgument("No JIT device registered for ", + device_type.type()); + } + *compilation_device_type = DeviceType(registration->compilation_device_name); + + TF_ASSIGN_OR_RETURN(auto allowed_gpus, GetAllowedGpus(flr)); + // TODO(b/255826209): Set platform, intra op parallelism threads if required + // and when supported by GetOrCreatePjRtClient(). 
+ // The `allowed_gpus` argument is used only if the `device_type` is GPU. + TF_ASSIGN_OR_RETURN(*pjrt_client, + GetOrCreatePjRtClient(device_type, allowed_gpus)); + + return OkStatus(); +} } // namespace xla::StatusOr>> ParseVisibleDeviceList( @@ -175,71 +237,45 @@ Status BuildXlaDeviceCompiler(DeviceBase* device, FunctionLibraryRuntime* flr, return OkStatus(); } -Status BuildPjRtDeviceCompiler(const XlaPlatformInfo& platform_info, - FunctionLibraryRuntime* flr, - PjRtDeviceCompiler** pjrt_device_compiler) { - PjRtDeviceExecutablePersistor::Config persistor_config( - GetMarkForCompilationPassFlags()->tf_xla_persistent_cache_directory, - GetMarkForCompilationPassFlags()->tf_xla_disable_strict_signature_checks, - GetMarkForCompilationPassFlags()->tf_xla_persistent_cache_prefix); +Status GetOrCreatePjRtDeviceCompilerAndProfiler( + const XlaPlatformInfo& platform_info, FunctionLibraryRuntime* flr, + PjRtDeviceCompiler** pjrt_device_compiler, + DeviceCompilationProfiler** profiler) { + // We store information about the JIT-compiled XLA computation + // in the ResourceMgr. + ResourceMgr* rm = tfrt_global::GetTFGlobalResourceMgr(); - DeviceType device_type = platform_info.device_type(); + const auto& device_type = platform_info.device_type(); + const std::string& compiler_name = + GetPjRtDeviceCompilerResourceName(device_type); - if (platform_info.xla_device_metadata()) { - VLOG(2) << "Building PjRtDeviceCompiler using " - "platform_info.xla_device_metadata()."; + // Lookup the DeviceCompiler, create one if not found. 
+ Status s = rm->Lookup( + rm->default_container(), compiler_name, pjrt_device_compiler); + if (!s.ok()) { + DeviceType compilation_device_type(""); + xla::PjRtClient* pjrt_client = nullptr; + TF_RETURN_IF_ERROR(GetCompilationDeviceTypeAndPjRtClient( + platform_info, flr, &compilation_device_type, &pjrt_client)); - DeviceType compilation_device_type = - platform_info.xla_device_metadata()->jit_device_type(); - TF_ASSIGN_OR_RETURN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); - - *pjrt_device_compiler = CreatePjRtDeviceCompiler( - persistor_config, compilation_device_type, pjrt_client); - return OkStatus(); - } - if (platform_info.pjrt_device_metadata()) { - VLOG(2) << "Building PjRtDeviceCompiler using " - "platform_info.pjrt_device_metadata()."; - - DeviceType compilation_device_type = - platform_info.pjrt_device_metadata()->jit_device_type(); - TF_ASSIGN_OR_RETURN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); - - *pjrt_device_compiler = CreatePjRtDeviceCompiler( - persistor_config, compilation_device_type, pjrt_client); - return OkStatus(); + TF_RETURN_IF_ERROR(rm->LookupOrCreate( + rm->default_container(), compiler_name, pjrt_device_compiler, + [&](PjRtDeviceCompiler** pjrt_device_compiler) { + *pjrt_device_compiler = + CreatePjRtDeviceCompiler(compilation_device_type, pjrt_client); + return OkStatus(); + })); } - // TFRT-TPU is used if device_type is `DEVICE_TPU` and platform_info does not - // have `xla_device_metadata`. 
- if (device_type == DEVICE_TPU) { - TF_ASSIGN_OR_RETURN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); - *pjrt_device_compiler = CreatePjRtDeviceCompiler( - persistor_config, DeviceType(DEVICE_TPU_XLA_JIT), pjrt_client); - return OkStatus(); - } + const std::string& profiler_name = + GetPjRtDeviceCompilationProfilerResourceName(device_type); + TF_RETURN_IF_ERROR(rm->LookupOrCreate( + rm->default_container(), profiler_name, profiler, + [](DeviceCompilationProfiler** profiler) { + *profiler = new DeviceCompilationProfiler(); + return OkStatus(); + })); - VLOG(2) << "platform_info.xla_device_metadata not found and " - "platform_info.device_type() != DEVICE_TPU. Building " - "PjRtDeviceCompiler for non-XLA device."; - - const XlaOpRegistry::DeviceRegistration* registration; - if (!XlaOpRegistry::GetCompilationDevice(device_type.type(), ®istration)) { - return errors::InvalidArgument("No JIT device registered for ", - device_type.type()); - } - auto compilation_device_type = - DeviceType(registration->compilation_device_name); - - TF_ASSIGN_OR_RETURN(auto allowed_gpus, GetAllowedGpus(flr)); - // TODO(b/255826209): Set platform, intra op parallelism threads if required - // and when supported by GetOrCreatePjRtClient(). - // The `allowed_gpus` argument is used only if the `device_type` is GPU. - TF_ASSIGN_OR_RETURN(auto pjrt_client, - GetOrCreatePjRtClient(device_type, allowed_gpus)); - - *pjrt_device_compiler = CreatePjRtDeviceCompiler( - persistor_config, compilation_device_type, pjrt_client); return OkStatus(); } diff --git a/tensorflow/compiler/jit/xla_platform_info.h b/tensorflow/compiler/jit/xla_platform_info.h index 725a876904d..4a8bc27a045 100644 --- a/tensorflow/compiler/jit/xla_platform_info.h +++ b/tensorflow/compiler/jit/xla_platform_info.h @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include #include "tensorflow/compiler/jit/device_compiler.h" #include "tensorflow/compiler/jit/pjrt_base_device.h" @@ -113,17 +114,21 @@ Status BuildXlaDeviceCompiler( DeviceCompiler** xla_device_compiler); -// Builds a DeviceCompiler that uses xla::PjRtClient using an appropriate +// Fetches a DeviceCompiler from the tfrt_global resource manager (or creates +// one there if not found) that uses xla::PjRtClient using an appropriate // PjRtClient for `platform_info.device_type()` and sets *pjrt_device_compiler -// to point to it. Uses flags from `MarkForCompilationPassFlags` for configuring -// the persistor used in the DeviceCompiler. Please note that non-XLA devices -// aren't supported yet. This is because: +// to point to it. Also fetches/creates a DeviceCompilationProfiler from/in the +// tfrt_global resource manager for `platform_info.device_type()` and sets +// *profiler to point to it. Uses flags from `MarkForCompilationPassFlags` for +// configuring the persistor used in the DeviceCompiler. Please note that +// non-XLA devices aren't supported yet. This is because: // 1. PjRtClient doesn't support data transfer for non-XLA devices yet // 2. Fetching the PjRtClient for non-XLA devices is also not supported yet -Status BuildPjRtDeviceCompiler( +Status GetOrCreatePjRtDeviceCompilerAndProfiler( const XlaPlatformInfo& platform_info, FunctionLibraryRuntime* flr, DeviceCompiler** - pjrt_device_compiler); + pjrt_device_compiler, + DeviceCompilationProfiler** profiler); // Returns information about the platform from kernel context. 
XlaPlatformInfo XlaPlatformInfoFromDevice(DeviceBase* device); diff --git a/tensorflow/compiler/jit/xla_platform_info_test.cc b/tensorflow/compiler/jit/xla_platform_info_test.cc index 0dedbb39bb9..e12a9366c04 100644 --- a/tensorflow/compiler/jit/xla_platform_info_test.cc +++ b/tensorflow/compiler/jit/xla_platform_info_test.cc @@ -81,7 +81,7 @@ TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerNonXlaDevice) { EXPECT_TRUE(xla_device_compiler->client() != nullptr); } -TEST_F(XlaPlatformInfoTest, BuildPjRtDeviceCompilerTestXlaDevice) { +TEST_F(XlaPlatformInfoTest, GetOrCreatePjRtDeviceCompilerAndProfilerXlaDevice) { DeviceType device_type = DeviceType(DEVICE_XLA_GPU); device_setup_.AddDevicesAndSetUp({device_type.type()}); @@ -91,23 +91,27 @@ TEST_F(XlaPlatformInfoTest, BuildPjRtDeviceCompilerTestXlaDevice) { XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); PjRtDeviceCompiler* pjrt_device_compiler = nullptr; - TF_EXPECT_OK(BuildPjRtDeviceCompiler(platform_info, device_setup_.flr(), - &pjrt_device_compiler)); + DeviceCompilationProfiler* profiler = nullptr; + TF_EXPECT_OK(GetOrCreatePjRtDeviceCompilerAndProfiler( + platform_info, device_setup_.flr(), &pjrt_device_compiler, &profiler)); core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); + core::ScopedUnref profiler_ref(profiler); TF_ASSERT_OK_AND_ASSIGN(auto pjrt_client, GetOrCreatePjRtClient(device_type)); EXPECT_EQ(pjrt_device_compiler->device_type(), metadata->jit_device_type()); EXPECT_EQ(pjrt_device_compiler->client(), pjrt_client); } -TEST_F(XlaPlatformInfoTest, BuildPjRtDeviceCompilerTestGpuDevice) { +TEST_F(XlaPlatformInfoTest, GetOrCreatePjRtDeviceCompilerAndProfilerGpuDevice) { device_setup_.AddDevicesAndSetUp({DEVICE_GPU}); Device* device = device_setup_.GetDevice(DEVICE_GPU); XlaPlatformInfo platform_info = XlaPlatformInfoFromDevice(device); PjRtDeviceCompiler* pjrt_device_compiler = nullptr; - TF_EXPECT_OK(BuildPjRtDeviceCompiler(platform_info, device_setup_.flr(), - 
&pjrt_device_compiler)); + DeviceCompilationProfiler* profiler = nullptr; + TF_EXPECT_OK(GetOrCreatePjRtDeviceCompilerAndProfiler( + platform_info, device_setup_.flr(), &pjrt_device_compiler, &profiler)); core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); + core::ScopedUnref profiler_ref(profiler); } #endif @@ -138,7 +142,7 @@ TEST_F(XlaPlatformInfoTest, BuildXlaDeviceCompilerTpuDevice) { // TODO(b/255826209): Look into using an actual TPU device for the unit test, // and move this out of OSS. -TEST_F(XlaPlatformInfoTest, BuildPjRtDeviceCompilerTpuDevice) { +TEST_F(XlaPlatformInfoTest, GetOrCreatePjRtDeviceCompilerAndProfilerTpuDevice) { DeviceType device_type = DeviceType(DEVICE_TPU); DeviceType compilation_device_type = DeviceType(DEVICE_TPU_XLA_JIT); // Use a CPU PjRtClient instead of a TPU one just for testing whether @@ -158,9 +162,11 @@ TEST_F(XlaPlatformInfoTest, BuildPjRtDeviceCompilerTpuDevice) { /*device_allocator=*/nullptr); PjRtDeviceCompiler* pjrt_device_compiler = nullptr; - TF_EXPECT_OK( - BuildPjRtDeviceCompiler(platform_info, nullptr, &pjrt_device_compiler)); + DeviceCompilationProfiler* profiler = nullptr; + TF_EXPECT_OK(GetOrCreatePjRtDeviceCompilerAndProfiler( + platform_info, nullptr, &pjrt_device_compiler, &profiler)); core::ScopedUnref pjrt_device_compiler_ref(pjrt_device_compiler); + core::ScopedUnref profiler_ref(profiler); EXPECT_EQ(pjrt_device_compiler->device_type(), compilation_device_type); EXPECT_EQ(pjrt_device_compiler->client(), pjrt_client); diff --git a/tensorflow/compiler/mlir/g3doc/_includes/tf_passes.md b/tensorflow/compiler/mlir/g3doc/_includes/tf_passes.md index 9405aa417df..342b5e0d23b 100644 --- a/tensorflow/compiler/mlir/g3doc/_includes/tf_passes.md +++ b/tensorflow/compiler/mlir/g3doc/_includes/tf_passes.md @@ -272,6 +272,10 @@ compiling to XLA. 
### `-tf-embedding-pipelining`: Rewrite graph for embedding pipelining For architectures that support accelerated embedding lookups, this pass will rewrite the graph to use pipelining for better device utilization. +### `-tf-embedding-sequencing`: Rewrite graph for sequential execution of embeddings +This is a strictly sequential and formally correct fallback option for the +embedding pipelining pass intended for debugging during pipelining +development. ### `-tf-executor-break-up-islands`: Transform from TF control dialect to TF executor dialect. ### `-tf-executor-check-control-dependencies`: Checks control dependencies This pass analyzes control dependencies between islands and warns about @@ -1993,4 +1997,21 @@ This pass will transform it into ### `-tf-verify-for-export`: Verify module is suitable for export back to TF Graph Verifies whether all functions in module are of single tf_executor.graph and each tf_executor.island in tf_executor.graph only has a single op. +### `-tf-xla-call-module-deserialization`: Deserializes StableHLO functions embedded in `tf.XlaCallModule` to top level module +This pass deserializes the StableHLO bytecodes embedded in tf.XlaCallModule, +then outlines the functions in the deserialized StableHLO module to the top +level MLIR module, with function renamings to avoid naming conflicts. + +After the outlining, it updates tf.XlaCallModule's module attribute to be +empty, adds an `_entry_function` attribute referring to the entry function. +It also adds a `_from_xla_call_module: true` attribute to each lifted +StableHLO function. +### `-tf-xla-call-module-serialization`: Serializes StableHLO functions from top-level module into `tf.XlaCallModule`'s `module` attribute +This pass collects StableHLO functions referenced from `tf.XlaCallModule`'s +`_entry_function` attribute into a module, serializes the module into MLIR +bytecode, and embed the bytecode to `tf.XlaCallModule`'s `module` attribute. 
+ +After serialization, this pass removes the `_entry_function` attribute from +`tf.XlaCallModule`, and removes all the serialized stablehlo functions +from the top-level module. ### `-tfe-legalize-tfg`: Legalize from TFG to the TFE dialect diff --git a/tensorflow/compiler/mlir/glob_lit_test.bzl b/tensorflow/compiler/mlir/glob_lit_test.bzl index 35491ed3d55..f65c86b727b 100644 --- a/tensorflow/compiler/mlir/glob_lit_test.bzl +++ b/tensorflow/compiler/mlir/glob_lit_test.bzl @@ -64,6 +64,7 @@ def _run_lit_test(name, data, size, tags, driver, features, exec_properties): ) def glob_lit_tests( + name = None, exclude = [], test_file_exts = _default_test_file_exts, default_size = _default_size, @@ -78,6 +79,7 @@ def glob_lit_tests( """Creates all plausible Lit tests (and their inputs) under this directory. Args: + name: str, name of the test_suite rule to generate for running all tests. exclude: [str], paths to exclude (for tests and inputs). test_file_exts: [str], extensions for files that are tests. default_size: str, the test size for targets not in "size_override". @@ -103,7 +105,10 @@ def glob_lit_tests( # Run tests individually such that errors can be attributed to a specific # failure. + all_tests = [] for curr_test in tests: + all_tests.append(curr_test + ".test") + # Instantiate this test with updated parameters. _run_lit_test( name = curr_test + ".test", @@ -114,3 +119,11 @@ def glob_lit_tests( features = features, exec_properties = exec_properties, ) + + # TODO: remove this check after making it a required param. 
+ if name: + native.test_suite( + name = name, + tests = all_tests, + tags = ["manual"], + ) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 27c2706622a..5bd1e41c068 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -342,8 +342,6 @@ cc_library( "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:LoopLikeInterface", "@llvm-project//mlir:Pass", - "@llvm-project//mlir:QuantOps", - "@llvm-project//mlir:SideEffectInterfaces", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", ], @@ -1235,6 +1233,8 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/quantization:quantization_passes", "//tensorflow/compiler/mlir/lite/quantization/tensorflow:tf_quantization_passes", + "//tensorflow/compiler/mlir/lite/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass", + "//tensorflow/compiler/mlir/lite/stablehlo:rename_entrypoint_to_main", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", diff --git a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h index 1ea913ae5c4..642a3349528 100644 --- a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h +++ b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h @@ -83,7 +83,9 @@ struct PassConfig { // Whether to run the `GuaranteeAllFuncsOneUsePass` to ensure each function // has a single use. bool guarantee_all_funcs_one_use; - // Whether to enable the hlo to tf conversion. + // Whether to enable the hlo/stablehlo to tf conversion. This also supports + // the case where a saved model contains both TF module and serialized + // StableHLO module. bool enable_hlo_to_tf_conversion; // Whether to enable to use DynamicUpdateSlice op. 
bool enable_dynamic_update_slice; diff --git a/tensorflow/compiler/mlir/lite/emit_error_reporter.cc b/tensorflow/compiler/mlir/lite/emit_error_reporter.cc index d280bec85f5..f9c4760326b 100644 --- a/tensorflow/compiler/mlir/lite/emit_error_reporter.cc +++ b/tensorflow/compiler/mlir/lite/emit_error_reporter.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/emit_error_reporter.h" +#include +#include + namespace tflite { int EmitErrorReporter::Report(const char* format, va_list args) { diff --git a/tensorflow/compiler/mlir/lite/experimental/common/BUILD b/tensorflow/compiler/mlir/lite/experimental/common/BUILD index 4b7a41ab347..02fab009fda 100644 --- a/tensorflow/compiler/mlir/lite/experimental/common/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/common/BUILD @@ -1,5 +1,7 @@ load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_cloud") +# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) + cc_library( name = "outline_operations", srcs = ["outline_operations.cc"], diff --git a/tensorflow/compiler/mlir/lite/experimental/remat/BUILD b/tensorflow/compiler/mlir/lite/experimental/remat/BUILD index b9b69bf852d..f0d059ca919 100644 --- a/tensorflow/compiler/mlir/lite/experimental/remat/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/remat/BUILD @@ -1,5 +1,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test") +# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) + cc_library( name = "rematerializer", srcs = ["rematerializer.cc"], diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/BUILD index 0903cda3e43..c5c4c422bf8 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/tac/BUILD @@ -232,6 +232,7 @@ cc_library( "transforms/get_alternative_subgraph.cc", "transforms/pick_subgraphs.cc", "transforms/raise_target_subgraphs.cc", 
+ "transforms/tac_filter.cc", "transforms/target_annotation.cc", ], hdrs = [ @@ -243,6 +244,7 @@ cc_library( ":common", ":cost_model", ":device_transform", + ":tac_filter_cc_proto", ":tac_importer_exporter", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tf_tfl_passes", @@ -253,6 +255,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", "@com_google_absl//absl/status", "@com_google_absl//absl/strings", + "@com_google_protobuf//:protobuf_headers", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:FuncDialect", @@ -380,3 +383,15 @@ py_library( "//tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper:_pywrap_tac_wrapper", ], ) + +proto_library( + name = "tac_filter_proto", + srcs = ["tac_filter.proto"], + compatible_with = get_compatible_with_cloud(), +) + +cc_proto_library( + name = "tac_filter_cc_proto", + compatible_with = get_compatible_with_cloud(), + deps = [":tac_filter_proto"], +) diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h b/tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h index 2bd79ddf848..2f2992871a1 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_COMMON_TARGETS_H_ #include +#include #include #include #include @@ -36,6 +37,9 @@ constexpr char kDevice[] = "tac.device"; // Inference type. constexpr char kInferenceType[] = "tac.inference_type"; +// Inference type. +constexpr char kSkipTargetAnnotation[] = "tac.skip_target_annotation"; + // TODO(renjieliu): Add more inference types. 
enum InferenceType { UNKNOWN = 0, diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/examples/example_hardware.cc b/tensorflow/compiler/mlir/lite/experimental/tac/examples/example_hardware.cc index e300f0686ac..6a580db3185 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/examples/example_hardware.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/examples/example_hardware.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/experimental/tac/examples/example_hardware.h" +#include + #include "tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter_test.cc b/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter_test.cc index 6d17c7f6ff6..932e047f7a4 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter_test.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter_test.cc @@ -15,6 +15,7 @@ #include "tensorflow/compiler/mlir/lite/experimental/tac/execution_metadata_exporter.h" #include +#include #include #include diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/cpu_hardware.cc b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/cpu_hardware.cc index a4f09f98bc7..09876d9373f 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/cpu_hardware.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/cpu_hardware.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.cc b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.cc index ed3785f6898..6af9f6211d3 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/gpu_hardware.h" +#include + #include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/nnapi_hardware.cc b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/nnapi_hardware.cc index ad09d8d2762..ab2de0b75d2 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/nnapi_hardware.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/nnapi_hardware.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/experimental/tac/hardwares/nnapi_hardware.h" +#include + #include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/common/utils.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.cc b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.cc index f1b883fda58..62874d7cc51 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include "llvm/ADT/DenseMap.h" #include "llvm/Support/raw_ostream.h" @@ -33,22 +34,16 @@ namespace mlir { namespace TFL { namespace tac { namespace { -struct RegisteredTargetHardware { - // TODO(b/177376459): Remove this constructor. 
- RegisteredTargetHardware(const std::string& name, - const std::string& description, mlir::TypeID type_id, - std::unique_ptr target_hardware) - : unique_name(GetCanonicalHardwareName(name)), - description(description), - type_id(type_id), - target_hardware(std::move(target_hardware)) {} +struct RegisteredTargetHardware { RegisteredTargetHardware( const std::string& name, const std::string& description, mlir::TypeID type_id, std::function()> target_hardware_factory) : unique_name(GetCanonicalHardwareName(name)), description(description), + type_id(type_id), + target_hardware(target_hardware_factory()), target_hardware_factory(target_hardware_factory) {} std::string unique_name; @@ -185,22 +180,6 @@ std::function()> GetTargetHardwareFactory( namespace internal { -void RegisterTargetHardware( - const std::string& unique_name, const std::string& description, - mlir::TypeID type_id, - std::function()> target_hardware_factory) { - auto* registered_hardwares = GetRegisteredHardwares(); - for (const auto& hardware : *registered_hardwares) { - if (hardware.unique_name == unique_name) { - llvm::errs() << "Ignoring duplicate hardware. Hardware " << unique_name - << " already registered\n"; - return; - } - } - registered_hardwares->push_back(RegisteredTargetHardware( - unique_name, description, type_id, target_hardware_factory())); -} - void RegisterTargetHardwareFactory( const std::string& unique_name, const std::string& description, mlir::TypeID type_id, diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h index 38286ed3cfe..9a1e21dcc19 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/hardwares/target_hardware.h @@ -44,7 +44,7 @@ constexpr static float kCrossHardwareTransferFixedCost = 10.f; // for registering the operation. 
class TargetHardwareOperation { public: - virtual ~TargetHardwareOperation() {} + virtual ~TargetHardwareOperation() = default; virtual double GetOpCost(mlir::Operation* op) const = 0; @@ -64,7 +64,7 @@ class TargetHardwareOperation { // }; class TargetHardware { public: - virtual ~TargetHardware() {} + virtual ~TargetHardware() = default; // Initializes all TargetHardwareOperation registered for this hardware. // Users overriding this function, should call the base class method to @@ -111,20 +111,6 @@ std::function()> GetTargetHardwareFactory( const std::string& hardware_name); namespace internal { -// DEPRECATED: Do not use, prefer using RegisterTargetHardwareFactory instead. -void RegisterTargetHardware( - const std::string& unique_name, const std::string& description, - mlir::TypeID type_id, - std::function()> target_hardware_factory); - -// DEPRECATED: Do not use, prefer using RegisterTargetHardwareFactory instead. -template -void RegisterTargetHardware( - const std::string& description, - std::function()> target_hardware_factory) { - RegisterTargetHardware(T::kId, description, mlir::TypeID::get(), - target_hardware_factory); -} void RegisterTargetHardwareFactory( const std::string& unique_name, const std::string& description, @@ -158,9 +144,6 @@ struct TargetHardwareRegistration { TargetHardwareRegistration(const std::string& description, std::function()> target_hardware_factory) { - // TODO(b/177376459): remove this. - internal::RegisterTargetHardware(description, - target_hardware_factory); internal::RegisterTargetHardwareFactory(description, target_hardware_factory); } @@ -185,7 +168,7 @@ struct TargetHardwareOpRegistration { //======== util functions ========== // Process user specified device specs, will always add CPU if it's not there. -// specified_deivce_specs: ',' separated, like "GPU,DSP,CPU". +// specified_device_specs: ',' separated, like "GPU,DSP,CPU". // device_specs: processed device specs enum. 
bool ProcessTargetDevices(llvm::ArrayRef specified_device_specs, std::vector* device_specs); diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD index d363334fb5f..57ee70321ee 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/BUILD @@ -103,7 +103,7 @@ pybind_extension( ] + if_onednn_v3(["@onednn_v3//:__subpackages__"]), deps = [ ":tac_wrapper_lib", - "//tensorflow/python:pybind11_lib", + "//tensorflow/python/lib/core:pybind11_lib", "//third_party/python_runtime:headers", "@pybind11", ], diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper.cc b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper.cc index b9b4d3d465b..0ae1e3db3fe 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "absl/status/status.h" #include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper_pybind11.cc b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper_pybind11.cc index 5d0366515cf..18616733118 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper_pybind11.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper_pybind11.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include #include +#include #include "pybind11/pybind11.h" // from @pybind11 #include "tensorflow/compiler/mlir/lite/experimental/tac/py_wrapper/tac_wrapper.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tac_filter.proto b/tensorflow/compiler/mlir/lite/experimental/tac/tac_filter.proto new file mode 100644 index 00000000000..d26e0996dbe --- /dev/null +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tac_filter.proto @@ -0,0 +1,49 @@ +syntax = "proto3"; + +package third_party.tensorflow.compiler.mlir.lite.experimental.tac; + +// A list of filters for TAC users to run ops/functions on ML hardwares. The +// intuition is that, for ops/functions that can be run on ML hardware (e.g. +// EdgeTPU) and TFLite CPU, TAC users give a hint that they're more performant +// to run on TFLite CPU. These filters give the TAC users freedom to specify the +// parts that they want to use other hardware to accelerate. +message TacFilters { + // A list of filters/rules to specify the parts that user wants to run on + // other hardware. + repeated TacFilter tac_filters = 1; +} + +// A filter can be used for an op or function. +message TacFilter { + oneof filter { + OpFilter op_filter = 1; + FunctionFilter function_filter = 2; + } +} + +// Function filter is to include/exclude a function in the target annotation +// pass in the TAC tool pipeline. +message FunctionFilter { + // Function filter types that are supported. If one function is matched for + // two rules with conflict, INCLUDE_TARGET_ANNOTATION has higher priority. + enum FunctionFilterType { + // To skip this function in the target annotation pass. This means all ops + // in this function run on TFLite CPU. + SKIP_TARGET_ANNOTATION = 0; + // To include this function in the target annotation pass. This has higher + // priority than `SKIP_TARGET_ANNOTATION`. + INCLUDE_TARGET_ANNOTATION = 1; + } + // This name corresponds to the TFLite subgraph name in the flatbuffer. 
+ // `function_name_pattern` supports regex matching. + string function_name_pattern = 1; + FunctionFilterType filter_type = 2; +} + +// Op filter is to filter out ops that user wants to run. Ops with this filter +// run on TFLite CPU. +message OpFilter { + // This name corresponds to the mlir::Location of the tensor. + // `op_name_pattern` supports regex matching. + string op_name_pattern = 1; +} diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h b/tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h index 3711b8874a3..a40a3b94b52 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tac_importer_exporter.h @@ -28,7 +28,7 @@ namespace tac { // See TacModule in how to register it with the module and use it. class TacImporter { public: - virtual ~TacImporter() {} + virtual ~TacImporter() = default; // Imports and returns the Module for the imported program. virtual absl::StatusOr> Import() = 0; @@ -40,7 +40,7 @@ class TacImporter { // See TacModule in how to register it with the module and use it. class TacExporter { public: - virtual ~TacExporter() {} + virtual ~TacExporter() = default; // Imports and returns the Module for the imported program. virtual absl::Status Export(mlir::ModuleOp module) = 0; diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.cc b/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.cc index 805b7802517..8313bf2c10e 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include #include +#include #include "absl/status/status.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.h b/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.h index 883b1ba84e2..7733a9bda80 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tac_module.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TAC_MODULE_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TAC_MODULE_H_ +#include #include #include #include @@ -57,7 +58,7 @@ class TacModule { bool legalize_to_tflite_ops = false; }; - virtual ~TacModule() {} + virtual ~TacModule() = default; explicit TacModule(const Options& options) : options_(options) {} diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/tests/BUILD index 1ae5f737d37..f3a0574e882 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = ["mlir"], diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/BUILD b/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/BUILD index 8fef794a866..58beccdb043 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/BUILD +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/e2e/BUILD @@ -10,6 +10,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = [ diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir index 
18b9e0fd605..3018221fdac 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/raise-target-subgraphs.mlir @@ -314,7 +314,7 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor : tensor<1xi32> %cst_3 = arith.constant dense<0> : tensor<1xi32> %0 = "tfl.shape"(%arg2) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> - %1 = "tfl.strided_slice"(%0, %cst_3, %cst_2, %cst_2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %1 = "tfl.strided_slice"(%0, %cst_3, %cst_2, %cst_2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, offset = false, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor %2 = "tfl.custom"(%cst_1, %1) {custom_code = "FlexTensorListReserve", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor>> %3 = "tfl.custom"(%cst_1, %1) {custom_code = "FlexTensorListReserve", custom_option = #tfl, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<1xi32>, tensor) -> tensor>> %4:8 = "tfl.while"(%cst_0, %cst_0, %arg5, %arg6, %2, %2, %3, %3) ({ @@ -337,23 +337,23 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor : tensor<1xi32> %cst_14 = arith.constant dense<0> : tensor<1xi32> %9 = "tfl.shape"(%arg1) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> - %10 = "tfl.strided_slice"(%9, %cst_14, %cst_13, %cst_13) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : 
(tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %10 = "tfl.strided_slice"(%9, %cst_14, %cst_13, %cst_13) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, offset = false, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor %11 = "tfl.range"(%cst_12, %10, %cst_11) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor %12 = "tfl.pack"(%10, %cst_11) {axis = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT", values_count = 2 : i32} : (tensor, tensor) -> tensor<2xi32> - %13 = "tfl.strided_slice"(%9, %cst_13, %cst_10, %cst_13) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %13 = "tfl.strided_slice"(%9, %cst_13, %cst_10, %cst_13) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, offset = false, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor %14 = tfl.mul(%11, %13) {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor %15 = "tfl.reshape"(%14, %12) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor - %16 = "tfl.strided_slice"(%9, %cst_14, %cst_10, %cst_13) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %16 = "tfl.strided_slice"(%9, %cst_14, %cst_10, %cst_13) {begin_mask = 1 : i32, ellipsis_mask = 0 : 
i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> %17 = "tfl.reduce_prod"(%16, %cst_14) {keep_dims = true, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>) -> tensor<1xi32> %18 = "tfl.reshape"(%arg1, %17) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor %19 = "tfl.shape"(%arg0) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> - %20 = "tfl.strided_slice"(%19, %cst_14, %cst_13, %cst_13) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %20 = "tfl.strided_slice"(%19, %cst_14, %cst_13, %cst_13) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, offset = false, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor %21 = "tfl.range"(%cst_12, %20, %cst_11) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor %22 = "tfl.pack"(%20, %cst_11) {axis = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT", values_count = 2 : i32} : (tensor, tensor) -> tensor<2xi32> - %23 = "tfl.strided_slice"(%19, %cst_13, %cst_10, %cst_13) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + %23 = "tfl.strided_slice"(%19, %cst_13, %cst_10, %cst_13) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : 
i32, shrink_axis_mask = 1 : i32, offset = false, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor %24 = tfl.mul(%21, %23) {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor %25 = "tfl.reshape"(%24, %22) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor - %26 = "tfl.strided_slice"(%19, %cst_14, %cst_10, %cst_13) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %26 = "tfl.strided_slice"(%19, %cst_14, %cst_10, %cst_13) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> %27 = "tfl.reduce_prod"(%26, %cst_14) {keep_dims = true, tac.device = "CPU", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>) -> tensor<1xi32> %28 = "tfl.reshape"(%arg0, %27) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor %29 = tfl.add %arg8, %cst_11 {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : tensor @@ -406,7 +406,7 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>) -> tensor attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_0"} { // CHECK: %0 = "tfl.shape"(%arg0) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> -// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, 
shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK: return %1 : tensor // CHECK: } // CHECK: func.func private @func_1_CPU_FLOAT(%arg0: tensor<1xi32>, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor, %arg7: tensor, %arg8: tensor<2xi32>) -> (tensor, tensor) attributes {tac.device = "CPU", tac.inference_type = "FLOAT", tac.interface_name = "func_1"} { @@ -458,25 +458,25 @@ func.func @cond_false_72730(%arg0: tensor, %arg1: tensor, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor, %arg4: tensor, %arg5: tensor<1xi32>) -> (tensor, tensor<2xi32>) attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_3"} { // CHECK: %0 = "tfl.shape"(%arg0) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> -// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK: %2 = "tfl.range"(%arg3, %1, %arg4) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, 
tensor, tensor) -> tensor // CHECK: %3 = "tfl.pack"(%1, %arg4) {axis = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT", values_count = 2 : i32} : (tensor, tensor) -> tensor<2xi32> -// CHECK: %4 = "tfl.strided_slice"(%0, %arg2, %arg5, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %4 = "tfl.strided_slice"(%0, %arg2, %arg5, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK: %5 = tfl.mul(%2, %4) {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %6 = "tfl.reshape"(%5, %3) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor -// CHECK: %7 = "tfl.strided_slice"(%0, %arg1, %arg5, %arg2) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> +// CHECK: %7 = "tfl.strided_slice"(%0, %arg1, %arg5, %arg2) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> // CHECK: return %6, %7 : tensor, tensor<2xi32> // CHECK: } // CHECK: func.func private @func_4_DARWINN_FLOAT(%arg0: tensor, %arg1: tensor<1xi32>, %arg2: tensor, %arg3: tensor<1xi32>, %arg4: tensor<1xi32>, %arg5: tensor, %arg6: tensor, %arg7: 
tensor<1xi32>) -> (tensor, tensor, tensor<2xi32>) attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_4"} { // CHECK: %0 = "tfl.reshape"(%arg0, %arg1) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<1xi32>) -> tensor // CHECK: %1 = "tfl.shape"(%arg2) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor) -> tensor<2xi32> -// CHECK: %2 = "tfl.strided_slice"(%1, %arg3, %arg4, %arg4) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %2 = "tfl.strided_slice"(%1, %arg3, %arg4, %arg4) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK: %3 = "tfl.range"(%arg5, %2, %arg6) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor, tensor) -> tensor // CHECK: %4 = "tfl.pack"(%2, %arg6) {axis = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT", values_count = 2 : i32} : (tensor, tensor) -> tensor<2xi32> -// CHECK: %5 = "tfl.strided_slice"(%1, %arg4, %arg7, %arg4) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor +// CHECK: %5 = "tfl.strided_slice"(%1, %arg4, %arg7, %arg4) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor // CHECK: %6 = tfl.mul(%3, 
%5) {fused_activation_function = "NONE", tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor) -> tensor // CHECK: %7 = "tfl.reshape"(%6, %4) {tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor, tensor<2xi32>) -> tensor -// CHECK: %8 = "tfl.strided_slice"(%1, %arg3, %arg7, %arg4) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> +// CHECK: %8 = "tfl.strided_slice"(%1, %arg3, %arg7, %arg4) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32, tac.device = "DARWINN", tac.inference_type = "FLOAT"} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> // CHECK: return %0, %7, %8 : tensor, tensor, tensor<2xi32> // CHECK: } // CHECK: func.func private @func_5_DARWINN_FLOAT(%arg0: tensor, %arg1: tensor<1xi32>, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor, %arg6: tensor<1xi32>, %arg7: tensor, %arg8: tensor, %arg9: tensor, %arg10: tensor, %arg11: tensor, %arg12: tensor, %arg13: tensor<5xi32>, %arg14: tensor, %arg15: tensor<5xi32>, %arg16: tensor, %arg17: tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor, tensor) attributes {tac.device = "DARWINN", tac.inference_type = "FLOAT", tac.interface_name = "func_5"} { diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/tac-filter.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/tac-filter.mlir new file mode 100644 index 00000000000..9b6d68c49f5 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/tac-filter.mlir @@ -0,0 +1,64 @@ +// RUN: tac-opt-all-backends -tfl-tac-filter='use-test-setting=true' %s -split-input-file -verify-diagnostics | FileCheck %s + +// expected-remark@below {{Tac filter (0): 
filter type: function filter SKIP_TARGET_ANNOTATION, filter_pattern: "^testFunction"}} +// expected-remark@below {{Tac filter (1): filter type: function filter INCLUDE_TARGET_ANNOTATION, filter_pattern: "testFunctionInclude"}} +// expected-remark@below {{Tac filter (1) specified but not applied to any op}} +// expected-remark@below {{Tac filter (2): filter type: op filter, filter_pattern: "^test_op"}} +// expected-remark@below {{Tac filter (2) specified but not applied to any op}} +module { + // CHECK-LABEL: testFunctionSkiped + // expected-remark@+1 {{filtered by tac filter (0)}} + func.func @testFunctionSkiped(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) { + // CHECK: tfl.add + // CHECK-SAME: tac.skip_target_annotation + %0 = "tfl.add"(%arg0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK: tfl.add + // CHECK-SAME: tac.skip_target_annotation + %1 = "tfl.add"(%arg0, %0) {fused_activation_function = "RELU"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + // CHECK: tfl.relu + // CHECK-SAME: tac.skip_target_annotation + %2 = "tfl.relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> + func.return + } +} + +// ----- + +// expected-remark@below {{Tac filter (0): filter type: function filter SKIP_TARGET_ANNOTATION, filter_pattern: "^testFunction"}} +// expected-remark@below {{Tac filter (1): filter type: function filter INCLUDE_TARGET_ANNOTATION, filter_pattern: "testFunctionInclude"}} +// expected-remark@below {{Tac filter (2): filter type: op filter, filter_pattern: "^test_op"}} +// expected-remark@below {{Tac filter (2) specified but not applied to any op}} +module { + // CHECK-LABEL: testFunctionInclude + // CHECK-NOT: tac.skip_target_annotation + // expected-remark@+2 {{filtered by tac filter (0)}} + // expected-remark@+1 {{filtered by tac filter (1)}} + func.func @testFunctionInclude(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) { + %0 = "tfl.add"(%arg0, %arg1) {fused_activation_function = "RELU6"} : 
(tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> + func.return + } +} + +// ----- + +// expected-remark@below {{Tac filter (0): filter type: function filter SKIP_TARGET_ANNOTATION, filter_pattern: "^testFunction"}} +// expected-remark@below {{Tac filter (0) specified but not applied to any op}} +// expected-remark@below {{Tac filter (1): filter type: function filter INCLUDE_TARGET_ANNOTATION, filter_pattern: "testFunctionInclude"}} +// expected-remark@below {{Tac filter (1) specified but not applied to any op}} +// expected-remark@below {{Tac filter (2): filter type: op filter, filter_pattern: "^test_op"}} +module { + // CHECK-LABEL: testOpFilter + // expected-remark@+1 {{all ops filtered by tac filter (2): "tfl.add", "tfl.relu"}} + func.func @testOpFilter(%arg0: tensor<1xf32>, %arg1: tensor<1xf32>) { + // CHECK: tfl.add + // CHECK-SAME: tac.skip_target_annotation + %0 = "tfl.add"(%arg0, %arg1) {fused_activation_function = "RELU6"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> loc("test_op_0") + // CHECK: tfl.add + // CHECK-NOT: tac.skip_target_annotation + %1 = "tfl.add"(%arg0, %0) {fused_activation_function = "RELU"} : (tensor<1xf32>, tensor<1xf32>) -> tensor<1xf32> loc("non_test_op") + // CHECK: tfl.relu + // CHECK-SAME: tac.skip_target_annotation + %2 = "tfl.relu"(%arg0) : (tensor<1xf32>) -> tensor<1xf32> loc("test_op_1") + func.return + } +} diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tests/target-annotation.mlir b/tensorflow/compiler/mlir/lite/experimental/tac/tests/target-annotation.mlir index 22faae6016c..8197ca323c4 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tests/target-annotation.mlir +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tests/target-annotation.mlir @@ -80,3 +80,12 @@ func.func @annotateInferenceType(%arg0: tensor<1x1x384x!quant.uniform>, tensor<1x384x1x!quant.uniform>) -> tensor<1x384x384x!quant.uniform> func.return %1 : tensor<1x384x384x!quant.uniform> } + +// ----- + +// CHECK-LABEL: 
testSkipAnnotation +func.func @testSkipAnnotation(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x3xf32>, %arg2: tensor<16xf32>) -> tensor<256x30x30x16xf32> { + // CHECK-NOT: tac.device + %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32, tac.skip_target_annotation } : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x30x30x16xf32> + func.return %0 : tensor<256x30x30x16xf32> +} diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.cc b/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.cc index e579b60869b..bf3481b79a4 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.h" +#include +#include #include #include "absl/status/status.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.h b/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.h index 2dcba4ab868..ed59787f946 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/tflite_import_export.h @@ -15,6 +15,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TFLITE_IMPORT_EXPORT_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TFLITE_IMPORT_EXPORT_H_ +#include #include #include diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc index 330d3096b51..9760bad9998 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/experimental/tac/transforms/device_transform_patterns.h" +#include #include #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/get_alternative_subgraph.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/get_alternative_subgraph.cc index 42852425741..7ccf26d3bac 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/get_alternative_subgraph.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/get_alternative_subgraph.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include #include +#include #include "absl/strings/str_cat.h" #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h index f738b2e7a60..a16b0f772c0 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/tac_filter.pb.h" namespace mlir { namespace TFL { @@ -64,6 +65,11 @@ std::unique_ptr> CreateGetOpCostPass(); std::unique_ptr> CreateFoldConstantsToSubgraphPass( bool fold_all_constants); +// Create an instance of TacFilterPass. +std::unique_ptr> CreateTacFilterPass( + ::third_party::tensorflow::compiler::mlir::lite::experimental::tac:: + TacFilters* tac_filters); + } // namespace tac } // namespace TFL } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc new file mode 100644 index 00000000000..a2f7441cc17 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_filter.cc @@ -0,0 +1,259 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "google/protobuf/text_format.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/Regex.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/experimental/tac/common/targets.h" +#include "tensorflow/compiler/mlir/lite/experimental/tac/tac_filter.pb.h" +#include "tensorflow/compiler/mlir/lite/experimental/tac/transforms/passes.h" + +namespace mlir { +namespace TFL { +namespace tac { +namespace { + +using ::third_party::tensorflow::compiler::mlir::lite::experimental::tac:: + FunctionFilter; +using ::third_party::tensorflow::compiler::mlir::lite::experimental::tac:: + TacFilter; +using ::third_party::tensorflow::compiler::mlir::lite::experimental::tac:: + TacFilters; + +class TacFilterPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TacFilterPass) + + TacFilterPass() = default; + TacFilterPass(const TacFilterPass& other) { + this->tac_filters_ = other.tac_filters_; + } + explicit TacFilterPass(TacFilters* tac_filters) { + tac_filters_ = tac_filters; + } + + private: + TacFilters* tac_filters_ = nullptr; + + llvm::StringRef getArgument() const final { return "tfl-tac-filter"; } + llvm::StringRef getDescription() const final { + return "This pass marks the ops to skip target annotation by inserting " + "`tac.skip_target_annotation` attribute to them based on user " + 
"provided config."; + } + + Option use_test_setting_{ + *this, "use-test-setting", + llvm::cl::desc( + "Whether to use the test config for the tac filter protobuf."), + llvm::cl::init(false)}; + + void runOnOperation() override; +}; + +void ApplyFunctionTacFilter(func::FuncOp func, + FunctionFilter::FunctionFilterType type, + OpBuilder& builder) { + for (Operation& op : func.front()) { + if (type == FunctionFilter::SKIP_TARGET_ANNOTATION) { + op.setAttr(kSkipTargetAnnotation, builder.getUnitAttr()); + } else if (type == FunctionFilter::INCLUDE_TARGET_ANNOTATION) { + op.removeAttr(kSkipTargetAnnotation); + } + } +} + +void ApplyTacFilter(ModuleOp module, const TacFilter& tac_filter, + SmallVector& filtered_ops, OpBuilder& builder) { + if (tac_filter.has_function_filter()) { + llvm::Regex func_regex( + tac_filter.function_filter().function_name_pattern()); + for (auto func : module.getOps()) { + if (!func_regex.match(func.getName())) { + continue; + } + + ApplyFunctionTacFilter(func, tac_filter.function_filter().filter_type(), + builder); + filtered_ops.push_back(func); + } + return; + } + + llvm::Regex op_regex(tac_filter.op_filter().op_name_pattern()); + module.walk([&](Operation* op) { + auto named_loc = op->getLoc().dyn_cast(); + if (!named_loc) { + return; + } + if (!op_regex.match(named_loc.getName())) { + return; + } + + op->setAttr(kSkipTargetAnnotation, builder.getUnitAttr()); + filtered_ops.push_back(op); + }); +} + +// A custom string for tac filter. 
+std::string TacFilterToString(const TacFilter& tac_filter) { + std::string tac_filter_type_str; + std::string tac_filter_name_pattern_str; + if (tac_filter.has_function_filter()) { + tac_filter_type_str = (llvm::Twine("function filter ") + + FunctionFilter::FunctionFilterType_Name( + tac_filter.function_filter().filter_type())) + .str(); + tac_filter_name_pattern_str = + tac_filter.function_filter().function_name_pattern(); + } else { + tac_filter_type_str = "op filter"; + tac_filter_name_pattern_str = tac_filter.op_filter().op_name_pattern(); + } + return (llvm::Twine("filter type: ") + tac_filter_type_str + + ", filter_pattern: \"" + tac_filter_name_pattern_str + "\"") + .str(); +} + +void PrintTacFilterResult(Location module_loc, const TacFilter& tac_filter, + int count, + const SmallVector& filtered_ops) { + emitRemark(module_loc) << llvm::formatv("Tac filter ({0}): {1}", count, + TacFilterToString(tac_filter)); + if (filtered_ops.empty()) { + emitRemark(module_loc) << llvm::formatv( + "Tac filter ({0}) specified but not applied to any op", count); + return; + } + + if (tac_filter.has_function_filter()) { + for (Operation* op : filtered_ops) { + auto func = cast(op); + func.emitRemark() << llvm::formatv("filtered by tac filter ({0})", count); + } + return; + } + + DenseMap> func_to_filtered_ops_map; + for (Operation* op : filtered_ops) { + auto func = op->getParentOfType(); + func_to_filtered_ops_map[func].push_back(op); + } + for (auto& [func, ops] : func_to_filtered_ops_map) { + std::string interleaved_op_name; + llvm::raw_string_ostream os(interleaved_op_name); + llvm::interleaveComma( + ops, os, [&](Operation* op) { os << "\"" << op->getName() << "\""; }); + os.flush(); + func.emitRemark() << llvm::formatv( + "all ops filtered by tac filter ({0}): {1}", count, + interleaved_op_name); + } +} + +void TacFilterPass::runOnOperation() { + TacFilters test_tac_filters; + if (use_test_setting_) { + // Sets up the test config used in the mlir LIT test. 
+ google::protobuf::TextFormat::ParseFromString(R"( + tac_filters { + function_filter { + function_name_pattern: "^testFunction" + } + } + tac_filters { + function_filter { + function_name_pattern: "testFunctionInclude" + filter_type: INCLUDE_TARGET_ANNOTATION + } + } + tac_filters { + op_filter { + op_name_pattern: "^test_op" + } + } + )", + &test_tac_filters); + tac_filters_ = &test_tac_filters; + } + + if (!tac_filters_) { + return; + } + + ModuleOp module = getOperation(); + OpBuilder builder(module); + std::sort(tac_filters_->mutable_tac_filters()->pointer_begin(), + tac_filters_->mutable_tac_filters()->pointer_end(), + [](const TacFilter* a, const TacFilter* b) { + const bool a_is_function_filter = a->has_function_filter(); + const bool b_is_function_filter = b->has_function_filter(); + if (a_is_function_filter != b_is_function_filter) { + // Function filter is applied before op filter. + return a_is_function_filter > b_is_function_filter; + } + + if (!a_is_function_filter && !b_is_function_filter) { + // The order of 2 op filters doesn't matter. + return false; + } + + const bool a_is_function_exclude = + (a->function_filter().filter_type() == + FunctionFilter::SKIP_TARGET_ANNOTATION); + const bool b_is_function_exclude = + (b->function_filter().filter_type() == + FunctionFilter::SKIP_TARGET_ANNOTATION); + // Function exclude filter is applied before function include + // filter.
+ return a_is_function_exclude > b_is_function_exclude; + }); + + for (const auto& tac_filter : llvm::enumerate(tac_filters_->tac_filters())) { + SmallVector filtered_ops; + ApplyTacFilter(module, tac_filter.value(), filtered_ops, builder); + PrintTacFilterResult(module.getLoc(), tac_filter.value(), + tac_filter.index(), filtered_ops); + } +} + +} // namespace + +std::unique_ptr> CreateTacFilterPass( + TacFilters* tac_filters) { + return std::make_unique(tac_filters); +} + +static PassRegistration pass; + +} // namespace tac +} // namespace TFL +} // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h index 392a2713e95..6e61dbe99bb 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h +++ b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/tac_pass.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_TAC_PASS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_EXPERIMENTAL_TAC_TRANSFORMS_TAC_PASS_H_ +#include #include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -39,7 +40,7 @@ class TacPass : public OperationPass { : OperationPass::OperationPass(mlir::TypeID::get()), module_(module) {} - ~TacPass() override {} + ~TacPass() override = default; const TargetHardware* GetTargetHardware( const std::string& hardware_name) const { @@ -62,7 +63,7 @@ class TacFunctionPass : public TacPass { public: using TacPass::TacPass; - ~TacFunctionPass() override {} + ~TacFunctionPass() override = default; mlir::func::FuncOp getFunction() { return getOperation(); } diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc index 2dddad4e9a8..6d1bf7ab934 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc +++ 
b/tensorflow/compiler/mlir/lite/experimental/tac/transforms/target_annotation.cc @@ -92,6 +92,9 @@ void SetAnnotation(Operation* op, std::string attribute, std::string annotation, void TargetAnnotationPass::SetTargetAnnotation( Operation* op, llvm::ArrayRef device_specs, OpBuilder* builder) { + if (op->hasAttr(kSkipTargetAnnotation)) { + return; + } const InferenceType inference_type = GetInferenceType(op); const std::string inference_type_str = GetInferenceString(inference_type); SetAnnotation(op, kInferenceType, inference_type_str, builder); diff --git a/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.cc b/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.cc index 20c81962e5a..aef77e208d2 100644 --- a/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.cc +++ b/tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.cc @@ -15,8 +15,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/experimental/tac/utils/utils.h" +#include #include #include +#include #include "absl/status/status.h" #include "absl/status/statusor.h" diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 1e4475bd4b3..9bb1c172116 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -649,7 +649,7 @@ class Translator { // to a function's body or while op. Modifies *region by calling // ExtractControlEdges. std::optional> BuildSubGraph( - const std::string& name, Region* region, const int index); + const std::string& name, Region* region, int index); // Modifies *block by unwrapping all ControlNodeOps. The DAG of the control // dependencies is returned as a vector of its edges, with node indices into @@ -674,8 +674,7 @@ class Translator { // 'items' is a map from tensor name in signatureDef to tensor name in // the subgraph, specified by the 'subgraph_index' argument. 
std::vector> GetList( - const int subgraph_index, - const std::map& items); + int subgraph_index, const std::map& items); // Uses the tf.entry_function attribute (if set) to initialize the op to name // mapping. diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 487b3edd60a..143b67acf96 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -20,7 +20,9 @@ limitations under the License. #include #include #include +#include #include +#include #include #include #include @@ -973,12 +975,19 @@ StatusOr> GetTensorIndices( return indices; } +// Given a list of tensor indices, returns true if any of the tensors have +// non-empty name strings. +bool HasNonEmptyNames(const tflite::SubGraphT& subgraph, + ArrayRef indices) { + return llvm::any_of( + indices, [&](int i) { return !subgraph.tensors.at(i)->name.empty(); }); +} + // Given a list of tensor indices, returns a string of concatenated tensor names // wrapped in a NamedAttribute. 
-template mlir::NamedAttribute BuildTFEntryFunctionAttribute( - const tflite::SubGraphT& subgraph, Builder* builder, const std::string name, - const ContainerType indices) { + const tflite::SubGraphT& subgraph, Builder* builder, + const std::string& name, ArrayRef indices) { auto tensor_names = llvm::map_range( indices, [&](int i) { return subgraph.tensors.at(i)->name; }); return builder->getNamedAttr( @@ -1351,15 +1360,17 @@ StatusOr ConvertSubgraph( // Set tf.entry_function attribute if (is_entry_point) { llvm::SmallVector attributes; - if (!func_inputs.empty()) { + if (HasNonEmptyNames(subgraph, func_inputs)) { attributes.push_back(BuildTFEntryFunctionAttribute( subgraph, &builder, "inputs", func_inputs)); } - if (!func_outputs.empty()) { + if (HasNonEmptyNames(subgraph, func_outputs)) { attributes.push_back(BuildTFEntryFunctionAttribute( subgraph, &builder, "outputs", func_outputs)); } - func->setAttr("tf.entry_function", builder.getDictionaryAttr(attributes)); + if (!attributes.empty()) { + func->setAttr("tf.entry_function", builder.getDictionaryAttr(attributes)); + } } else { func.setPrivate(); } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.h b/tensorflow/compiler/mlir/lite/flatbuffer_import.h index 8707be2894e..76edd13afd4 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.h +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_FLATBUFFER_IMPORT_H_ #include +#include #include "absl/strings/string_view.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index 35475091aa8..2f1779b97d0 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/flatbuffer_operator.h" +#include #include #include diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc index 88e3029188b..62cb3447313 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_translate.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MemoryBuffer.h" diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc index f23dfd96e88..58820f0edee 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.cc @@ -16,9 +16,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include +#include #include #include #include +#include #include #include #include @@ -3330,33 +3332,36 @@ OpFoldResult StridedSliceOp::fold(FoldAdaptor) { namespace { -// Computes the permutation of a constant `input_tensor` according to `perm`. // The function recursively traverses the dimensions of the output tensor in -// a row-major order and writes the value in the output tensor into -// `new_values`. 
-void ComputePermutation(mlir::detail::ElementsAttrRange< - mlir::detail::ElementsAttrIterator> - input_tensor_values, - ArrayRef perm, ArrayRef output_shape, - const int num_dimensions, const int output_axis, - std::vector* input_indices, - std::vector* new_values) { - // Refer to the implementation of `Transpose` function in - // tensorflow/lite/kernels/internal/reference/reference_ops.h - assert(output_axis < num_dimensions); - const int input_axis = perm[output_axis]; - for (int i = 0; i < output_shape[output_axis]; ++i) { +// a row-major order and writes the value of the output tensor into +// `output_element_addr`. +// TODO(@lukeboyer) make element byte size a template param. +void ComputePermutation(ArrayRef perms, ArrayRef output_shape, + const char* raw_input, const int element_byte_size, + const int64_t current_axis, char*& output_element_addr, + MutableArrayRef current_input_index, + ShapedType input_shape_type) { + const int64_t input_axis = perms[current_axis]; + const bool is_last_axis = current_axis == output_shape.size() - 1; + for (int i = 0; i < output_shape[current_axis]; ++i) { // Update the input indices on `input_axis`. - input_indices->at(input_axis) = i; + current_input_index[input_axis] = i; // Write the value from `input_tensor` if it is the last axis or // recurse into the next axis. - const bool is_last_axis = output_axis == num_dimensions - 1; if (is_last_axis) { - new_values->push_back(input_tensor_values[*input_indices]); + int64_t input_flat_index = ElementsAttr::getFlattenedIndex( + input_shape_type, current_input_index); + // Address of input element to write raw data. + const char* input_element_addr = + raw_input + (input_flat_index * element_byte_size); + std::memcpy(output_element_addr, input_element_addr, element_byte_size); + // Increment the next output address to write to by bytes equal to + // width of constituent elements.
+ output_element_addr += element_byte_size; } else { - ComputePermutation(input_tensor_values, perm, output_shape, - num_dimensions, output_axis + 1, input_indices, - new_values); + ComputePermutation(perms, output_shape, raw_input, element_byte_size, + current_axis + 1, output_element_addr, + current_input_index, input_shape_type); } } } @@ -3365,8 +3370,8 @@ void ComputePermutation(mlir::detail::ElementsAttrRange< OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { auto operands = adaptor.getOperands(); - assert(operands.size() == 2); - auto input_tensor = operands[0].dyn_cast_or_null(); + + auto input_tensor = operands[0].dyn_cast_or_null(); auto perm_tensor = operands[1].dyn_cast_or_null(); if (!input_tensor || !perm_tensor) return nullptr; @@ -3375,33 +3380,56 @@ OpFoldResult TransposeOp::fold(FoldAdaptor adaptor) { if (!getType().cast().getElementType().isSignlessIntOrFloat()) return nullptr; - assert(perm_tensor.getShapedType().getRank() == 1); - const int num_dimensions = input_tensor.getShapedType().getRank(); - assert(perm_tensor.getShapedType().getNumElements() == num_dimensions); - - ArrayRef input_shape = input_tensor.getShapedType().getShape(); - auto output_type = getType().cast(); - - SmallVector perm; - SmallVector output_shape; + // TODO(b/280099953) This algorithm only works for fixed width element types. + // This is the usual case, but consider falling back to old approach + // if transposing string tensors becomes needed while folding. + if (!input_tensor.getElementType().isIntOrIndexOrFloat()) return nullptr; + SmallVector perms; + SmallVector output_shape; + ArrayRef input_shape = input_tensor.getType().getShape(); + auto attr_iter = perm_tensor.getValues(); + const int num_dimensions = input_tensor.getType().getRank(); for (int i = 0; i < num_dimensions; ++i) { - perm.push_back(perm_tensor.getValues()[i].getInt()); - output_shape.push_back(input_shape[perm[i]]); - - // Check that the derived output shape matches the static shape. 
- assert(!output_type.hasStaticShape() || - output_type.getShape()[i] == output_shape[i]); + perms.push_back(attr_iter[i].getInt()); + output_shape.push_back(input_shape[perms[i]]); } - std::vector new_values; - new_values.reserve(input_tensor.getShapedType().getNumElements()); - std::vector input_indices(num_dimensions); - auto input_tensor_values = input_tensor.getValues(); - ComputePermutation(input_tensor_values, perm, output_shape, num_dimensions, - /*output_axis=*/0, &input_indices, &new_values); - auto result_type = tensorflow::GetTypeFromTFTensorShape( - output_shape, output_type.getElementType()); - return DenseElementsAttr::get(result_type, new_values); + // If the input tensor values are splat, then it has exactly one value. + // It is sufficient then to just reshape the input data. + if (input_tensor.isSplat()) { + return input_tensor.reshape(input_tensor.getType().cloneWith( + output_shape, input_tensor.getElementType())); + } + + // MLIR implementation pads elements < 8 bits to 8 bits and pads non byte + // aligned to the nearest byte. So this is allowed. + const char* raw_input = input_tensor.getRawData().data(); + const int element_byte_size = + input_tensor.getElementType().getIntOrFloatBitWidth() / 8; + + // Hold current ND index in input tensor when computing + // permutation. + llvm::OwningArrayRef current_input_index( + input_tensor.getType().getRank()); + + // Allocate raw data and retrieve address of the first char in its raw + // buffer. + llvm::OwningArrayRef raw_output_arr(input_tensor.getRawData()); + char* raw_output = (char*)raw_output_arr.data(); + + // Compute the result and write to `raw_output`. 
+ ComputePermutation(perms, output_shape, raw_input, element_byte_size, + /*current_axis=*/0, raw_output, current_input_index, + input_tensor.getType()); + + bool detected_splat = false; + const bool valid_output_buffer = DenseElementsAttr::isValidRawBuffer( + input_tensor.getType(), raw_output_arr, detected_splat); + if (!valid_output_buffer || detected_splat) return nullptr; + + auto result_type = + RankedTensorType::get(output_shape, input_tensor.getElementType()); + return DenseElementsAttr::getFromRawBuffer(result_type, raw_output_arr); } mlir::LogicalResult TransposeOp::verify() { diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 8266fc605c0..01c77e4f21c 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -803,14 +803,14 @@ def TFL_ConcatenationOp : TFL_Op<"concatenation", let arguments = ( ins TFL_VariadicTensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, UI8, I1]>:$values, + [F32, I64, I32, I16, I8, QI8, QUI8, UI8, UI32, I1]>:$values, I32Attr:$axis, TFL_AFAttr:$fused_activation_function ); let results = (outs TFL_TensorOf< - [F32, I64, I32, I16, I8, QI8, QUI8, UI8, I1]>:$output + [F32, I64, I32, I16, I8, QI8, QUI8, UI8, UI32, I1]>:$output ); let hasOptions = 1; @@ -3063,11 +3063,11 @@ def TFL_RangeOp: TFL_Op<"range", [ }]; let arguments = (ins - TFL_TensorOf<[I32, F32]>:$start, - TFL_TensorOf<[I32, F32]>:$limit, - TFL_TensorOf<[I32, F32]>:$delta); + TFL_TensorOf<[I32, F32, I64]>:$start, + TFL_TensorOf<[I32, F32, I64]>:$limit, + TFL_TensorOf<[I32, F32, I64]>:$delta); - let results = (outs TFL_TensorOf<[I32, F32]>:$result); + let results = (outs TFL_TensorOf<[I32, F32, I64]>:$result); let hasFolder = 1; } @@ -3873,7 +3873,8 @@ def TFL_StridedSliceOp: TFL_Op<"strided_slice", [ I32Attr:$end_mask, I32Attr:$ellipsis_mask, I32Attr:$new_axis_mask, - I32Attr:$shrink_axis_mask + I32Attr:$shrink_axis_mask, + BoolAttr:$offset ); let results = (outs diff 
--git a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h index 738ea1ecd2d..322ec2e852d 100644 --- a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h +++ b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h @@ -15,7 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_METRICS_ERROR_COLLECTOR_INST_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_METRICS_ERROR_COLLECTOR_INST_H_ +#include #include +#include #include #include "mlir/IR/Location.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc index 7874f6c5f3c..75a0c3eb3bb 100644 --- a/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc +++ b/tensorflow/compiler/mlir/lite/metrics/error_collector_inst_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h" #include +#include #include #include #include @@ -52,7 +53,7 @@ class MockSuccessPass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MockSuccessPass) - explicit MockSuccessPass() {} + explicit MockSuccessPass() = default; private: void runOnOperation() override { @@ -73,7 +74,7 @@ class MockFailurePass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MockFailurePass) - explicit MockFailurePass() {} + explicit MockFailurePass() = default; private: void runOnOperation() override { diff --git a/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc b/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc index bef202887a8..4f97fb56f86 100644 --- a/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc +++ b/tensorflow/compiler/mlir/lite/mlir_tflite_runner.cc @@ -21,6 +21,7 @@ limitations under the License. 
#include #include #include +#include #include #include diff --git a/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.cc b/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.cc index b71d6c1dbd2..02743c9c65f 100644 --- a/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.cc +++ b/tensorflow/compiler/mlir/lite/python/flatbuffer_to_mlir.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include +#include #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index 85c87fd66ad..344c558ba3e 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include "llvm/Support/ToolOutputFile.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc index 1b0f22c7cd1..473f63812bd 100644 --- a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include #include +#include #include "absl/strings/str_join.h" #include "absl/types/span.h" diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 74c09b3e9e6..acbfa08e770 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -17,7 +17,9 @@ limitations under the License. #include #include #include +#include #include +#include #include "absl/types/span.h" #include "llvm/ADT/StringSet.h" diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index 5cfbc0c937a..fb5efba769a 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include "llvm/Support/ToolOutputFile.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index d5a98612b2d..85d4ddffaa2 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -20,6 +20,7 @@ limitations under the License. 
#include #include #include +#include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index 29216f3be16..7581b5c78cf 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h" #include +#include #include "absl/container/flat_hash_set.h" #include "absl/strings/string_view.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc index 353e023d3fb..798e011dec2 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc @@ -17,9 +17,13 @@ limitations under the License. #include #include #include +#include +#include #include #include #include +#include +#include #include #include diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h index d7cb5ab1fe6..6c94e4c2d10 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h @@ -17,6 +17,7 @@ limitations under the License. 
#include #include +#include #include #include diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc index 9ae572b1c7a..36855cdb744 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights_test.cc @@ -15,8 +15,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/lite/quantize_weights.h" #include +#include #include #include +#include #include #include "llvm/ADT/Twine.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc index 5bd1b71e631..fe5ca2ca8f1 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/tfl_quantizer.cc @@ -15,6 +15,7 @@ limitations under the License. #include #include +#include #include "absl/strings/string_view.h" #include "llvm/Support/CommandLine.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc index d1fc9318116..759893401e6 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/fallback_to_flex_ops.cc @@ -171,7 +171,7 @@ inline TFL::ConstBytesAttr CustomOptionForFlexOp(OpBuilder *builder, class FallbackToFlexOps : public PassWrapper> { public: - FallbackToFlexOps() {} + FallbackToFlexOps() = default; explicit FallbackToFlexOps(const std::string &mode) { mode_ = mode; } FallbackToFlexOps(const FallbackToFlexOps &other) { mode_ = other.mode_; } diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/BUILD b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/BUILD index 03332e19f6a..796676e1d28 
100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = ["mlir"], diff --git a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc index 6afc81e8ce9..9dfe5166033 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tensorflow/tf_to_quant.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/quantization/tests/BUILD b/tensorflow/compiler/mlir/lite/quantization/tests/BUILD index 03332e19f6a..796676e1d28 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = ["mlir"], diff --git a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc index 04d0b7675bb..ad4112a05ad 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" diff --git a/tensorflow/compiler/mlir/lite/quantization/tools/tflite_op_coverage_spec_getters_gen.cc b/tensorflow/compiler/mlir/lite/quantization/tools/tflite_op_coverage_spec_getters_gen.cc index f73f162929f..7cbcb108729 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tools/tflite_op_coverage_spec_getters_gen.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tools/tflite_op_coverage_spec_getters_gen.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ #include +#include #include +#include #include "absl/strings/match.h" #include "absl/strings/str_replace.h" diff --git a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc index 525d73c1b79..04e9a070af8 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/odml_to_stablehlo.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include #include +#include #include #include #include diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/tests/BUILD index 7002dd57dda..dd691a25be1 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/BUILD @@ -10,6 +10,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir/lite/stablehlo:run_lit.sh", size_override = { diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc index b7277ae0415..2a7950cf581 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc @@ -44,6 +44,28 @@ namespace odml { static constexpr std::string_view kStablehloModuleDefaultEntryFuncName = "main"; static constexpr std::string_view kStablehloFuncNamePrefix = "XlaCallModule"; +static constexpr char kShardingAttr[] = "mhlo.sharding"; +static constexpr char kShardingName[] = "Sharding"; + +class RemoveCustomCallWithSharding + : public mlir::OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + mlir::LogicalResult matchAndRewrite( + mlir::stablehlo::CustomCallOp op, + PatternRewriter &rewriter) const override { + // Removes the custom call with sharding op if the operand type is the + // same as the result type. 
+ if (op->hasAttr(kShardingAttr) && op.getCallTargetName() == kShardingName && + op.getNumOperands() == 1 && op.getNumResults() == 1 && + op.getOperands().front().getType() == + op.getResults().front().getType()) { + rewriter.replaceOp(op, op.getOperands()); + return mlir::success(); + } + return mlir::failure(); + } +}; class ConvertTFXlaCallModuleOp : public mlir::OpRewritePattern { @@ -90,10 +112,12 @@ class ConvertTFXlaCallModuleOp stablehlo_module_op.get().getOps()) { mlir::func::FuncOp cloned_func_op = func_op.clone(); if (cloned_func_op.getSymName().contains( - kStablehloModuleDefaultEntryFuncName)) { + kStablehloModuleDefaultEntryFuncName) && + cloned_func_op.getSymVisibility() == "public") { main_fn = cloned_func_op; - main_fn.setSymVisibility(stablehlo_builder.getStringAttr("private")); } + cloned_func_op.setSymVisibility( + stablehlo_builder.getStringAttr("private")); parent_module_symbol_table.insert(cloned_func_op); } @@ -159,6 +183,7 @@ class TFXlaCallModuleOpToStablehloPass ModuleOp module_op = getOperation(); RewritePatternSet patterns(&getContext()); patterns.add(&getContext(), module_op); + patterns.add(&getContext()); if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { return signalPassFailure(); } diff --git a/tensorflow/compiler/mlir/lite/tests/BUILD b/tensorflow/compiler/mlir/lite/tests/BUILD index f2cfa39ab54..72efe28296c 100644 --- a/tensorflow/compiler/mlir/lite/tests/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", exclude = ["load-quantization-recipe.mlir"], diff --git a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir index 5dcaada7e56..ba9c1e58565 100644 --- a/tensorflow/compiler/mlir/lite/tests/const-fold.mlir +++ b/tensorflow/compiler/mlir/lite/tests/const-fold.mlir @@ -450,6 +450,7 @@ func.func 
@transpose_no_fold(%arg0 : tensor<2xi32>) -> tensor<2x2xi32> { func.return %0 : tensor<2x2xi32> } + // CHECK-LABEL: @transpose_1d // Basic 1D identity func.func @transpose_1d() -> tensor<3xi32> { @@ -484,6 +485,17 @@ func.func @transpose_2d() -> tensor<2x2xi32> { func.return %0 : tensor<2x2xi32> } +// CHECK-LABEL: @transpose_2d_splat +func.func @transpose_2d_splat() -> tensor<3x2xi32> { + %cst = arith.constant dense<0> : tensor<2x3xi32> + %cst_perm = arith.constant dense<[1, 0]> : tensor<2xi32> + + // CHECK: %[[CST:.*]] = arith.constant dense<0> : tensor<3x2xi32> + // CHECK: return %[[CST]] + %0 = "tfl.transpose"(%cst, %cst_perm) : (tensor<2x3xi32>, tensor<2xi32>) -> tensor<3x2xi32> + func.return %0 : tensor<3x2xi32> +} + // CHECK-LABEL: @transpose_2d_identity func.func @transpose_2d_identity() -> tensor<2x2xi32> { %cst = arith.constant dense<[[0, 1], [2, 3]]> : tensor<2x2xi32> @@ -837,7 +849,7 @@ func.func @ConstFoldStridedSlice(%arg0 : tensor<15600xf32>) -> tensor<15600xf32> %0 = "tfl.pseudo_const"() {value = dense<15600> : tensor<1xi32>} : () -> tensor<1xi32> %1 = "tfl.pseudo_const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> %2 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> - %3 = "tfl.strided_slice"(%arg0, %1, %0, %2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<15600xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<15600xf32> + %3 = "tfl.strided_slice"(%arg0, %1, %0, %2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<15600xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<15600xf32> func.return %3 : tensor<15600xf32> // CHECK: return %arg0 } @@ -846,7 +858,7 @@ func.func @ConstFoldStridedSliceMultiDims(%arg0 : tensor<10x10x10xf32>) -> tenso %0 = "tfl.pseudo_const"() {value = dense<[10, 10, 10]> : 
tensor<3xi32>} : () -> tensor<3xi32> %1 = "tfl.pseudo_const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> %2 = "tfl.pseudo_const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> - %3 = "tfl.strided_slice"(%arg0, %1, %0, %2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<10x10x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<10x10x10xf32> + %3 = "tfl.strided_slice"(%arg0, %1, %0, %2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<10x10x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<10x10x10xf32> func.return %3 : tensor<10x10x10xf32> // CHECK: return %arg0 } @@ -855,7 +867,7 @@ func.func @NotFoldStridedSlice(%arg0 : tensor<10x10x10xf32>) -> tensor<9x9x9xf32 %0 = "tfl.pseudo_const"() {value = dense<[9, 9, 9]> : tensor<3xi32>} : () -> tensor<3xi32> %1 = "tfl.pseudo_const"() {value = dense<0> : tensor<3xi32>} : () -> tensor<3xi32> %2 = "tfl.pseudo_const"() {value = dense<1> : tensor<3xi32>} : () -> tensor<3xi32> - %3 = "tfl.strided_slice"(%arg0, %1, %0, %2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<10x10x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<9x9x9xf32> + %3 = "tfl.strided_slice"(%arg0, %1, %0, %2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<10x10x10xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<9x9x9xf32> func.return %3 : tensor<9x9x9xf32> // CHECK: %[[STRIDED_SLICE:.*]] = "tfl.strided_slice" // CHECK: return %[[STRIDED_SLICE]] diff --git a/tensorflow/compiler/mlir/lite/tests/debuginfo/BUILD b/tensorflow/compiler/mlir/lite/tests/debuginfo/BUILD index 9a0b427f294..bb7412a10f9 100644 --- 
a/tensorflow/compiler/mlir/lite/tests/debuginfo/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/debuginfo/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [ ":debug_info_files", ":test_utilities", diff --git a/tensorflow/compiler/mlir/lite/tests/end2end/BUILD b/tensorflow/compiler/mlir/lite/tests/end2end/BUILD index b162606d135..b0e8270e4dc 100644 --- a/tensorflow/compiler/mlir/lite/tests/end2end/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/end2end/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [ ":quant_stats_files", ":test_utilities", diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD index f7dbeaf48af..e1687b22816 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/BUILD @@ -7,6 +7,7 @@ load("//tensorflow:tensorflow.bzl", "tf_native_cc_binary") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [ ":extra_files", ":test_utilities", diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/empty_input_output_names.json b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/empty_input_output_names.json new file mode 100644 index 00000000000..87c809fa7cc --- /dev/null +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/empty_input_output_names.json @@ -0,0 +1,81 @@ +// RUN: json_to_flatbuffer %p/test_schema.fbs %s | flatbuffer_translate --tflite-flatbuffer-to-mlir -o - | FileCheck %s + +// If input and output tensors don't have names, there shouldn't be an +// `tf.entry_function` attribute created. 
+// CHECK-NOT: tf.entry_function + +{ + "version": 3, + "operator_codes": [ + { + "builtin_code": "CONV_2D" + } + ], + "subgraphs": [ + { + "tensors": [ + { + "shape": [ + 256, + 32, + 32, + 3 + ], + "quantization": { + } + }, + { + "shape": [ + 16, + 3, + 3, + 3 + ], + "quantization": { + } + }, + { + "shape": [ + 0 + ], + }, + { + "shape": [ + 256, + 32, + 32, + 16 + ], + "quantization": { + } + } + ], + "inputs": [ + 0, + 1 + ], + "outputs": [ + 3 + ], + "operators": [ + { + "inputs": [ + 0, + 1, + -1 + ], + "outputs": [ + 3 + ], + "builtin_options_type": "Conv2DOptions", + "builtin_options": { + "stride_w": 1, + "stride_h": 1 + } + } + ], + "name": "main" + } + ], + "description": "MLIR Converted." +} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc index ba0fd474a3a..8fc5a0cb051 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/importer_test_min_max.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include #include #include +#include #include "absl/strings/string_view.h" #include "llvm/Support/CommandLine.h" diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir index 4f58b7af868..15a0f4b160d 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf.mlir @@ -1307,84 +1307,84 @@ func.func @resize_with_bilinear_with_half_pixel_centers(%arg0: tensor<1x100x100x } func.func @strided_slice(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xf32> { - %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> func.return %0 : tensor<1x2x2x5xf32> // CHECK-LABEL: strided_slice - // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> } func.func @strided_slice_with_constant_attributes(%arg0: tensor<10x10x10xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<10x10xf32> { %cst = 
arith.constant dense<-1> : tensor<1xi32> %cst_1 = arith.constant dense<0> : tensor<1xi32> %cst_2 = arith.constant dense<1> : tensor<1xi32> - %0 = "tf.StridedSlice"(%arg0, %cst, %cst_1, %cst_2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32> + %0 = "tf.StridedSlice"(%arg0, %cst, %cst_1, %cst_2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64, offset = false} : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32> func.return %0 : tensor<10x10xf32> // CHECK-LABEL: strided_slice_with_constant_attributes // CHECK-DAG: [[BEGIN:%cst.*]] = arith.constant dense<-1> : tensor<1xi32> // CHECK-DAG: [[END:%cst.*]] = arith.constant dense<0> : tensor<1xi32> // CHECK-DAG: [[STRIDES:%cst.*]] = arith.constant dense<1> : tensor<1xi32> - // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32} : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32> + // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32} : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32> } func.func @strided_slice_with_string(%arg0: tensor<12x2x2x5x!tf_type.string>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> { - %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, 
tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> + %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> func.return %0 : tensor<1x2x2x5x!tf_type.string> // CHECK-LABEL: strided_slice_with_string - // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> + // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> } func.func @strided_slice_with_unranked_input_and_i64_parameters(%arg0: tensor<*xf32>, %arg1: tensor<1xi64>, %arg2: tensor<1xi64>, %arg3: tensor<1xi64>) -> tensor<*xf32> { - %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<*xf32>, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<*xf32> + %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<*xf32>, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<*xf32> func.return %0 : tensor<*xf32> // CHECK-LABEL: strided_slice_with_unranked_input_and_i64_parameters // CHECK-DAG: [[BEGIN:%.*]] = "tfl.cast"(%arg1) : (tensor<1xi64>) -> tensor<1xi32> // CHECK-DAG: [[END:%.*]] = 
"tfl.cast"(%arg2) : (tensor<1xi64>) -> tensor<1xi32> // CHECK-DAG: [[STRIDES:%.*]] = "tfl.cast"(%arg3) : (tensor<1xi64>) -> tensor<1xi32> - // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<*xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<*xf32> + // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<*xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<*xf32> } func.func @strided_slice_with_i64_parameters(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi64>, %arg2: tensor<1xi64>, %arg3: tensor<1xi64>) -> tensor<1x2x2x5xf32> { - %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<12x2x2x5xf32>, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<1x2x2x5xf32> + %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<1x2x2x5xf32> func.return %0 : tensor<1x2x2x5xf32> // CHECK-LABEL: strided_slice_with_i64_parameters // CHECK-DAG: [[BEGIN:%.*]] = "tfl.cast"(%arg1) : (tensor<1xi64>) -> tensor<1xi32> // CHECK-DAG: [[END:%.*]] = "tfl.cast"(%arg2) : (tensor<1xi64>) -> tensor<1xi32> // CHECK-DAG: [[STRIDES:%.*]] = "tfl.cast"(%arg3) : (tensor<1xi64>) -> tensor<1xi32> - // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, 
tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> } func.func @strided_slice_with_i64_constant_attributes(%arg0: tensor<10x10x10xf32>) -> tensor<10x10xf32> { %cst = arith.constant dense<-1> : tensor<1xi64> %cst_1 = arith.constant dense<0> : tensor<1xi64> %cst_2 = arith.constant dense<1> : tensor<1xi64> - %0 = "tf.StridedSlice"(%arg0, %cst, %cst_1, %cst_2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<10x10x10xf32>, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<10x10xf32> + %0 = "tf.StridedSlice"(%arg0, %cst, %cst_1, %cst_2) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64, offset = false} : (tensor<10x10x10xf32>, tensor<1xi64>, tensor<1xi64>, tensor<1xi64>) -> tensor<10x10xf32> func.return %0 : tensor<10x10xf32> // CHECK-LABEL: strided_slice_with_i64_constant_attributes // CHECK-DAG: [[BEGIN:%cst.*]] = arith.constant dense<-1> : tensor<1xi32> // CHECK-DAG: [[END:%cst.*]] = arith.constant dense<0> : tensor<1xi32> // CHECK-DAG: [[STRIDES:%cst.*]] = arith.constant dense<1> : tensor<1xi32> - // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 1 : i32} : (tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32> + // CHECK-NEXT: "tfl.strided_slice"(%arg0, [[BEGIN]], [[END]], [[STRIDES]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 1 : i32} : 
(tensor<10x10x10xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<10x10xf32> } func.func @strided_slice_non_zero_ellipsis_mask(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xf32> { - %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 1 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 1 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> func.return %0 : tensor<1x2x2x5xf32> // CHECK-LABEL: strided_slice_non_zero_ellipsis_mask - // CHECK: %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 1 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + // CHECK: %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 1 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> } func.func @strided_slice_non_zero_new_axis_mask(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xf32> { - %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 2 : i64, shrink_axis_mask = 0 : i64} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask 
= 0 : i64, end_mask = 0 : i64, new_axis_mask = 2 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> func.return %0 : tensor<1x2x2x5xf32> // CHECK-LABEL: strided_slice_non_zero_new_axis_mask - // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 2 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 2 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> } func.func @strided_slice_big_dims(%arg0: tensor<5x6x7xf32>, %arg1: tensor<3xi32>, %arg2: tensor<3xi32>, %arg3: tensor<3xi32>) -> tensor<1x1x5x6x7xf32> { - %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 7 : i64, shrink_axis_mask = 0 : i64} : (tensor<5x6x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x1x5x6x7xf32> + %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 7 : i64, shrink_axis_mask = 0 : i64, offset = false} : (tensor<5x6x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x1x5x6x7xf32> func.return %0 : tensor<1x1x5x6x7xf32> // CHECK-LABEL: strided_slice_big_dims - // CHECK: %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 7 : i64, shrink_axis_mask = 0 : i64} : (tensor<5x6x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x1x5x6x7xf32> + // CHECK: %0 = "tf.StridedSlice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i64, 
ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 7 : i64, offset = false, shrink_axis_mask = 0 : i64} : (tensor<5x6x7xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x1x5x6x7xf32> } func.func @slice1Tensor(%arg0: tensor<2x3x5xf32>, %arg1: tensor<3xi32>, %arg2: tensor<3xi32>) -> tensor { @@ -2627,3 +2627,31 @@ func.func @batchmatmul2fullyconnected(%arg0: tensor<4x128x2xf32>) -> (tensor<4x1 // CHECK: return %2 : tensor<4x128x1xf32> } +func.func @approx_top_k_with_max_k_last_reduction_dimension(%arg0: tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) { + %values, %indices = "tf.ApproxTopK"(%arg0) {aggregate_to_topk = true, is_max_k = true, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 1 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) + func.return %values, %indices: tensor<1x4xf32>, tensor<1x4xi32> + + // CHECK-LABEL: approx_top_k_with_max_k_last_reduction_dimension + // CHECK-DAG: %cst = arith.constant dense<4> : tensor + // CHECK: %values, %indices = "tfl.topk_v2"(%arg0, %cst) : (tensor<1x4xf32>, tensor) -> (tensor<1x4xf32>, tensor<1x4xi32>) + // CHECK: return %values, %indices : tensor<1x4xf32>, tensor<1x4xi32> +} + +func.func @approx_top_k_with_min_k(%arg0: tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) { + %values, %indices = "tf.ApproxTopK"(%arg0) {aggregate_to_topk = true, is_max_k = false, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 1 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) + func.return %values, %indices: tensor<1x4xf32>, tensor<1x4xi32> + + // CHECK-LABEL: approx_top_k_with_min_k + // CHECK: %values, %indices = "tf.ApproxTopK"(%arg0) {aggregate_to_topk = true, is_max_k = false, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 1 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, 
tensor<1x4xi32>) + // CHECK: return %values, %indices : tensor<1x4xf32>, tensor<1x4xi32> +} + +func.func @approx_top_k_reduction_dimension_not_last_dim(%arg0: tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) { + %values, %indices = "tf.ApproxTopK"(%arg0) {aggregate_to_topk = true, is_max_k = true, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 0 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) + func.return %values, %indices: tensor<1x4xf32>, tensor<1x4xi32> + + // CHECK-LABEL: approx_top_k_reduction_dimension_not_last_dim + // CHECK: %values, %indices = "tf.ApproxTopK"(%arg0) {aggregate_to_topk = true, is_max_k = true, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 0 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) + // CHECK: return %values, %indices : tensor<1x4xf32>, tensor<1x4xi32> +} + diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2exec/BUILD b/tensorflow/compiler/mlir/lite/tests/mlir2exec/BUILD index 930e0f20b05..8bb228b8520 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2exec/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/mlir2exec/BUILD @@ -14,6 +14,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = ["mlir"], diff --git a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/BUILD b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/BUILD index 7e748ffe18d..3d4e40f9119 100644 --- a/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/BUILD +++ b/tensorflow/compiler/mlir/lite/tests/mlir2flatbuffer/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = 
"@llvm-project//mlir:run_lit.sh", test_file_exts = ["mlir"], diff --git a/tensorflow/compiler/mlir/lite/tests/ops.mlir b/tensorflow/compiler/mlir/lite/tests/ops.mlir index 628e523c488..51fc212a2a7 100644 --- a/tensorflow/compiler/mlir/lite/tests/ops.mlir +++ b/tensorflow/compiler/mlir/lite/tests/ops.mlir @@ -1684,32 +1684,32 @@ func.func @testResizeBilinearInvalidOutputType(%arg0 : tensor<1x100x100x3xf32>, // CHECK-LABEL: testStridedSlice func.func @testStridedSlice(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xf32> { - // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> - %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + // CHECK: "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> func.return %0 : tensor<1x2x2x5xf32> } // CHECK-LABEL: testStridedSliceWithQI8 func.func @testStridedSliceWithQI8(%arg0: tensor<12x2x2x5x!quant.uniform>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> { - %0 = "tfl.strided_slice"(%arg0, 
%arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> func.return %0 : tensor<1x2x2x5x!quant.uniform> } // CHECK-LABEL: testStridedSliceWithQUI8 func.func @testStridedSliceWithQUI8(%arg0: tensor<12x2x2x5x!quant.uniform>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> { - %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!quant.uniform> func.return %0 : tensor<1x2x2x5x!quant.uniform> } // CHECK-LABEL: testStridedSliceTFType func.func @testStridedSliceTFType(%arg0: tensor<12x2x2x5xui8>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.quint8> { - %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xui8>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.quint8> + %0 = 
"tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5xui8>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.quint8> func.return %0 : tensor<1x2x2x5x!tf_type.quint8> } // CHECK-LABEL: testStridedSliceWithString func.func @testStridedSliceWithString(%arg0: tensor<12x2x2x5x!tf_type.string>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> { - %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5x!tf_type.string>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5x!tf_type.string> func.return %0 : tensor<1x2x2x5x!tf_type.string> } @@ -1717,7 +1717,7 @@ func.func @testStridedSliceWithString(%arg0: tensor<12x2x2x5x!tf_type.string>, % func.func @testStridedSliceWithInvalidOutputType(%arg0: tensor<12x2x2x5xf32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<1x2x2x5xi32> { // expected-error @+1 {{op failed to verify that input and output must have same element type}} - %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xi32> + %0 = "tfl.strided_slice"(%arg0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : 
i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xi32> func.return %0 : tensor<1x2x2x5xi32> } diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index 8d57178a47f..2515e209396 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -13,7 +13,7 @@ func.func @fusedConv2dRelu(%arg0: tensor<256x32x32x3xf32>, %arg1: tensor<16x3x3x %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> %1 = "tfl.relu"(%0) : (tensor<256x32x32x16xf32>) -> tensor<256x32x32x16xf32> func.return %1 : tensor<256x32x32x16xf32> - + // CHECK: %0 = "tfl.conv_2d"(%arg0, %arg1, %arg2) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "RELU", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<256x32x32x3xf32>, tensor<16x3x3x3xf32>, tensor<16xf32>) -> tensor<256x32x32x16xf32> // CHECK: return %0 } @@ -568,8 +568,8 @@ func.func @FuseFullyConnectedAddWithScalarRhs(%arg0: tensor<40x37xf32>, %arg1: t // CHECK: return %[[fc]] } -// CHECK-LABEL: @FuseFullyConnectedAddWithUnfusableRhs -func.func @FuseFullyConnectedAddWithUnfusableRhs(%arg0: tensor<4x37xf32>, %arg1: tensor<4x37xf32>) -> tensor<4x4xf32> { +// CHECK-LABEL: @FuseFullyConnectedAddNoBiasWithUnfusableRhs +func.func @FuseFullyConnectedAddNoBiasWithUnfusableRhs(%arg0: tensor<4x37xf32>, %arg1: tensor<4x37xf32>) -> tensor<4x4xf32> { %cst = "tfl.no_value"() {value} : () -> none %cst2 = arith.constant dense<[[2.0, 2.1, 2.2, 2.3], [2.0, 2.1, 2.2, 2.3], [2.0, 2.1, 2.2, 2.3], [2.0, 2.1, 2.2, 2.3]]> : tensor<4x4xf32> @@ -585,6 +585,23 @@ func.func 
@FuseFullyConnectedAddWithUnfusableRhs(%arg0: tensor<4x37xf32>, %arg1: // CHECK: return %[[add_result]] } +// CHECK-LABEL: @FuseFullyConnectedAddWithUnfusableRhs +func.func @FuseFullyConnectedAddWithUnfusableRhs(%arg0: tensor<4x37xf32>, %arg1: tensor<4x37xf32>) -> tensor<4x4xf32> { + %cst = arith.constant dense<[2.0, 2.1, 2.2, 2.3]> : tensor<4xf32> + %cst2 = arith.constant dense<[[2.0, 2.1, 2.2, 2.3], [2.0, 2.1, 2.2, 2.3], [2.0, 2.1, 2.2, 2.3], [2.0, 2.1, 2.2, 2.3]]> : tensor<4x4xf32> + + %0 = "tfl.fully_connected" (%arg0, %arg1, %cst) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x37xf32>, tensor<4x37xf32>, tensor<4xf32>) -> (tensor<4x4xf32>) + %1 = "tfl.add"(%0, %cst2) {fused_activation_function = "NONE"} : (tensor<4x4xf32>, tensor<4x4xf32>) -> tensor<4x4xf32> + + func.return %1 : tensor<4x4xf32> + + // CHECK-DAG: %[[bias:.*]] = arith.constant dense<{{.*}}> : tensor<4xf32> + // CHECK-DAG: %[[filter:.*]] = arith.constant dense<{{.*}}> : tensor<4x4xf32> + // CHECK: %[[fc_result:.*]] = "tfl.fully_connected"(%arg0, %arg1, %[[bias]]) + // CHECK: %[[add_result:.*]] = tfl.add %[[fc_result]], %[[filter]] + // CHECK: return %[[add_result]] +} + // CHECK-LABEL: @FuseFullyConnectedReshapeAddConst // FOLD-LABEL: @FuseFullyConnectedReshapeAddConst func.func @FuseFullyConnectedReshapeAddConst(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { @@ -643,6 +660,46 @@ func.func @RetainRedundantReshapeUseInNonBinaryOp(%arg0: tensor<128xf32>, %arg1: // CHECK: return %1, %2 } +// CHECK-LABEL: @FuseTransposeReshapeTranspose +func.func @FuseTransposeReshapeTranspose(%arg0: tensor<1x16x256xf32>) -> tensor<16x256xf32> { + %cst_10 = arith.constant dense<[0, 2, 1]> : tensor<3xi32> + %cst_3 = arith.constant dense<[256, 16]> : tensor<2xi32> + %cst_6 = arith.constant dense<[1, 0]> : tensor<2xi32> + %2057 = "tfl.transpose"(%arg0, %cst_10) : (tensor<1x16x256xf32>, tensor<3xi32>) -> tensor<1x256x16xf32> + %2058 = 
"tfl.reshape"(%2057, %cst_3) : (tensor<1x256x16xf32>, tensor<2xi32>) -> tensor<256x16xf32> + %2059 = "tfl.transpose"(%2058, %cst_6) : (tensor<256x16xf32>, tensor<2xi32>) -> tensor<16x256xf32> + return %2059: tensor<16x256xf32> + // CHECK-DAG: %cst = arith.constant dense<[16, 256]> : tensor<2xi32> + // CHECK: %0 = "tfl.reshape"(%arg0, %cst) : (tensor<1x16x256xf32>, tensor<2xi32>) -> tensor<16x256xf32> + // CHECK: return %0 +} + +// CHECK-LABEL: @FoldDoubleTranspose +func.func @FoldDoubleTranspose(%arg0: tensor<1x4x1440x256xf32>) -> tensor<1x1440x256x4xf32> { + %cst_12 = arith.constant dense<[0, 1, 3, 2]> : tensor<4xi32> + %cst_18 = arith.constant dense<[0, 2, 1, 3]> : tensor<4xi32> + %2112 = "tfl.transpose"(%arg0, %cst_18) : (tensor<1x4x1440x256xf32>, tensor<4xi32>) -> tensor<1x1440x4x256xf32> + %2114 = "tfl.transpose"(%2112, %cst_12) : (tensor<1x1440x4x256xf32>, tensor<4xi32>) -> tensor<1x1440x256x4xf32> + return %2114 : tensor<1x1440x256x4xf32> + // CHECK-DAG: %cst = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> + // CHECK: %0 = "tfl.transpose"(%arg0, %cst) : (tensor<1x4x1440x256xf32>, tensor<4xi32>) -> tensor<1x1440x256x4xf32> + // CHECK: return %0 +} + +// CHECK-LABEL: @FoldMultpleTranspose +func.func @FoldMultpleTranspose(%arg0: tensor<1x4x1440x256xf32>) -> tensor<1x256x4x1440xf32> { + %cst_11 = arith.constant dense<[0, 2, 3, 1]> : tensor<4xi32> + %cst_12 = arith.constant dense<[0, 1, 3, 2]> : tensor<4xi32> + %cst_18 = arith.constant dense<[0, 2, 1, 3]> : tensor<4xi32> + %2112 = "tfl.transpose"(%arg0, %cst_11) : (tensor<1x4x1440x256xf32>, tensor<4xi32>) -> tensor<1x1440x256x4xf32> + %2113 = "tfl.transpose"(%2112, %cst_18) : (tensor<1x1440x256x4xf32>, tensor<4xi32>) -> tensor<1x256x1440x4xf32> + %2114 = "tfl.transpose"(%2113, %cst_12) : (tensor<1x256x1440x4xf32>, tensor<4xi32>) -> tensor<1x256x4x1440xf32> + return %2114 : tensor<1x256x4x1440xf32> + // CHECK-DAG: %cst = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> + // CHECK: %0 = 
"tfl.transpose"(%arg0, %cst) : (tensor<1x4x1440x256xf32>, tensor<4xi32>) -> tensor<1x256x4x1440xf32> + // CHECK: return %0 +} + // CHECK-LABEL: @FuseFullyConnectedReshapeAddConstWithOptionalAttribute // FOLD-LABEL: @FuseFullyConnectedReshapeAddConstWithOptionalAttribute func.func @FuseFullyConnectedReshapeAddConstWithOptionalAttribute(%arg0: tensor<40x37xf32>, %arg1: tensor<40x37xf32>) -> tensor<40x40xf32> { @@ -2613,6 +2670,63 @@ func.func @noReplaceReshapeEqualWithOneHotBadIndex(%arg: tensor<2xi32>) -> tenso // CHECK: %[[RES:.*]] = "tfl.equal"(%[[TMP]], %[[CST2]]) : (tensor<2x1xi32>, tensor<3xi32>) -> tensor<2x3xi1> } +// CHECK-LABEL: ReplaceReshapeEqualOneHotDynamicBatch +func.func @ReplaceReshapeEqualOneHotDynamicBatch(%arg0: tensor) -> (tensor) { + %cst = arith.constant dense<-1> : tensor + %cst_0 = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi32> + %0 = "tfl.expand_dims"(%arg0, %cst) : (tensor, tensor) -> tensor + %1 = "tfl.equal"(%0, %cst_0) : (tensor, tensor<10xi32>) -> tensor + %2 = "tfl.cast"(%1) : (tensor) -> tensor + func.return %2 : tensor + + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<-1> : tensor<1xi32> + // CHECK-DAG: %[[CST_0:.*]] = arith.constant dense<10> : tensor + // CHECK-DAG: %[[CST_1:.*]] = arith.constant dense<1.000000e+00> : tensor + // CHECK-DAG: %[[CST_2:.*]] = arith.constant dense<0.000000e+00> : tensor + // CHECK-DAG: %[[CST_3:.*]] = arith.constant dense<-1> : tensor + // CHECK: %[[EXPAND_DIMS:.*]] = "tfl.expand_dims"(%arg0, %[[CST_3]]) : (tensor, tensor) -> tensor + // CHECK: %[[RESHAPE:.*]] = "tfl.reshape"(%0, %[[CST]]) : (tensor, tensor<1xi32>) -> tensor + // CHECK: %[[ONE_HOT:.*]] = "tfl.one_hot"(%1, %[[CST_0]], %[[CST_1]], %[[CST_2]]) {axis = -1 : i32} : (tensor, tensor, tensor, tensor) -> tensor + // CHECK-NEXT: return %[[ONE_HOT]] +} + +// CHECK-LABEL: noReplaceReshapeEqualWithOneHotDynamicNonBatch +func.func @noReplaceReshapeEqualWithOneHotDynamicNonBatch(%arg0: tensor<1x?xi32>) -> tensor<1x?x10xf32> 
{ + %cst = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi32> + %1 = "tfl.equal"(%arg0, %cst) : (tensor<1x?xi32>, tensor<10xi32>) -> tensor<1x?x10xi1> + %2 = "tfl.cast"(%1) : (tensor<1x?x10xi1>) -> tensor<1x?x10xf32> + func.return %2 : tensor<1x?x10xf32> + + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi32> + // CHECK: %[[EQUAL:.*]] = "tfl.equal"(%arg0, %[[CST]]) : (tensor<1x?xi32>, tensor<10xi32>) -> tensor<1x?x10xi1> + // CHECK: %[[CAST:.*]] = "tfl.cast"(%[[EQUAL]]) : (tensor<1x?x10xi1>) -> tensor<1x?x10xf32> + // CHECK-NEXT: return %[[CAST]] +} + +// CHECK-LABEL: noReplaceReshapeEqualWithOneHotUnranked +func.func @noReplaceReshapeEqualWithOneHotUnranked(%arg0: tensor<*xi1>) -> tensor<*xi1> { + %cst = arith.constant dense : tensor + %1 = "tfl.equal"(%arg0, %cst) : (tensor<*xi1>, tensor) -> tensor<*xi1> + func.return %1 : tensor<*xi1> + + // CHECK-DAG: %[[CST:.*]] = arith.constant dense : tensor + // CHECK: %[[EQUAL:.*]] = "tfl.equal"(%arg0, %cst) : (tensor<*xi1>, tensor) -> tensor<*xi1> + // CHECK-NEXT: return %[[EQUAL]] +} + +// CHECK-LABEL: noReplaceReshapeEqualWithOneHotDynamicNonBatchRank1 +func.func @noReplaceReshapeEqualWithOneHotDynamicNonBatchRank1(%arg0: tensor) -> tensor { + %cst = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi32> + %1 = "tfl.equal"(%arg0, %cst) : (tensor, tensor<10xi32>) -> tensor + %2 = "tfl.cast"(%1) : (tensor) -> tensor + func.return %2 : tensor + + // CHECK-DAG: %[[CST:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]> : tensor<10xi32> + // CHECK: %[[EQUAL:.*]] = "tfl.equal"(%arg0, %[[CST]]) : (tensor, tensor<10xi32>) -> tensor + // CHECK: %[[CAST:.*]] = "tfl.cast"(%[[EQUAL]]) : (tensor) -> tensor + // CHECK-NEXT: return %[[CAST]] +} + // CHECK-LABEL: fuseOneHotCast func.func @fuseOneHotCast(%arg: tensor<2xi32>) -> (tensor<2x3xf32>, tensor<2x3xf32>) { %depth = arith.constant dense<3> : tensor @@ -3118,3 +3232,160 @@ func.func 
@DontEliminateExtraSelect(%arg0: tensor<4x2xf32>, %arg1: tensor<4x2xi1 // CHECK-NEXT: %[[SELECT_1:.*]] = "tfl.select_v2" // CHECK-NEXT: return %[[SELECT_1]] } + +// CHECK-LABEL: func @fuseReluToMin1_StaticShapeWithBroadcastedCst_Float1 +func.func @fuseReluToMin1_StaticShapeWithBroadcastedCst_Float1(%arg0: tensor<2x2xf32>) -> (tensor<2x2xf32>) { + %cst0 = arith.constant dense<0.0> : tensor + %0 = "tfl.maximum"(%arg0, %cst0) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %cst1 = arith.constant dense<1.0> : tensor + %1 = "tfl.minimum"(%0, %cst1) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + + func.return %1 : tensor<2x2xf32> + // CHECK-NOT: "tfl.relu" + // CHECK-NOT: "tfl.minimum" + // CHECK-NOT: "tfl.pseudo_const" + // CHECK: "tfl.relu_0_to_1"(%arg0) : (tensor<2x2xf32>) -> tensor<2x2xf32> +} + +// CHECK-LABEL: func @fuseReluToMin1_StaticShapeWithBroadcastedCst_Float2 +func.func @fuseReluToMin1_StaticShapeWithBroadcastedCst_Float2(%arg0: tensor<2x2xf32>) -> (tensor<2x2xf32>) { + %cst0 = arith.constant dense<1.0> : tensor + %0 = "tfl.minimum"(%arg0, %cst0) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + %cst1 = arith.constant dense<0.0> : tensor + %1 = "tfl.maximum"(%0, %cst1) : (tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + + func.return %1 : tensor<2x2xf32> + // CHECK-NOT: "tfl.relu" + // CHECK-NOT: "tfl.minimum" + // CHECK-NOT: "tfl.pseudo_const" + // CHECK: "tfl.relu_0_to_1"(%arg0) : (tensor<2x2xf32>) -> tensor<2x2xf32> +} + +// CHECK-LABEL: func @fuseReluToMin1_StaticShapeWithSameShapeCst_Float +func.func @fuseReluToMin1_StaticShapeWithSameShapeCst_Float2(%arg0: tensor<2x2xf32>) -> (tensor<2x2xf32>) { + %cst0 = arith.constant dense<1.0> : tensor<2x2xf32> + %0 = "tfl.minimum"(%arg0, %cst0) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + %cst1 = arith.constant dense<0.0> : tensor<2x2xf32> + %1 = "tfl.maximum"(%0, %cst1) : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + + func.return %1 : tensor<2x2xf32> + // CHECK-NOT: "tfl.relu" + // 
CHECK-NOT: "tfl.minimum" + // CHECK-NOT: "tfl.pseudo_const" + // CHECK: "tfl.relu_0_to_1"(%arg0) : (tensor<2x2xf32>) -> tensor<2x2xf32> +} + + + +// CHECK-LABEL: func @fuseAddAndStridedSlice +func.func @fuseAddAndStridedSlice(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { + // CHECK: %cst = arith.constant dense<1> : tensor<1xi32> + // CHECK: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %1 = "tfl.strided_slice"(%arg0, %arg1, %cst, %0) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = true, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + + %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %0 = "tfl.add"(%arg1, %cst_0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + func.return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fuseSubAndStridedSlice +func.func @fuseSubAndStridedSlice(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { + // CHECK: %cst = arith.constant dense<1> : tensor<1xi32> + // CHECK: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %1 = "tfl.strided_slice"(%arg0, %arg1, %cst, %0) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = true, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + + %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + %cst_1 = "tfl.pseudo_const"() {value = 
dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %0 = "tfl.sub"(%arg1, %cst_0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + func.return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @dontFuseAddAndStridedSliceNonConstantStride +func.func @dontFuseAddAndStridedSliceNonConstantStrides(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>) -> tensor<4xi32> { + // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %1 = tfl.add(%arg1, %0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %2 = "tfl.strided_slice"(%arg0, %arg1, %1, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + + %cst = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tfl.add"(%arg1, %cst) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + func.return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @dontFuseAddAndStridedSliceOffset +func.func @dontFuseAddAndStridedSliceOffset(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>) -> tensor<4xi32> { + // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + // CHECK: %1 = tfl.add(%arg2, %0) 
{fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %2 = "tfl.strided_slice"(%arg0, %arg1, %1, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + + %cst = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tfl.add"(%arg2, %cst) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + func.return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @dontFuseAddAndStridedSliceNonConstantOffset +func.func @dontFuseAddAndStridedSliceNonConstantOffset(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>) -> tensor<4xi32> { + // CHECK: %0 = tfl.add %arg1, %arg1 {fused_activation_function = "NONE"} : tensor<1xi32> + // CHECK: "tfl.strided_slice"(%arg0, %arg1, %0, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + + %0 = "tfl.add"(%arg1, %arg1) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %arg2) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + func.return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @dontFuseAddAndStridedSliceBeginMask +func.func @dontFuseAddAndStridedSliceBeginMask(%arg0: tensor<4xi32>, %arg1: 
tensor<1xi32>) -> tensor<4xi32> { + // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + // CHECK-DAG: %1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %2 = tfl.add(%arg1, %0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + + %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %0 = "tfl.add"(%arg1, %cst_0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %cst_1) {begin_mask = 1 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + func.return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @dontFuseAddAndStridedSliceEndMask +func.func @dontFuseAddAndStridedSliceEndMask(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { + // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + // CHECK-DAG: %1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %2 = tfl.add(%arg1, %0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + + %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} 
: () -> tensor + %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %0 = "tfl.add"(%arg1, %cst_0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + func.return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @dontFuseAddAndStridedSliceEllipsisMask +func.func @dontFuseAddAndStridedSliceEllipsisMask(%arg0: tensor<4xi32>, %arg1: tensor<1xi32>) -> tensor<4xi32> { + // CHECK-DAG: %0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + // CHECK-DAG: %1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + // CHECK: %2 = tfl.add(%arg1, %0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + // CHECK: %3 = "tfl.strided_slice"(%arg0, %arg1, %2, %1) {begin_mask = 0 : i32, ellipsis_mask = 1 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + + %cst_0 = "tfl.pseudo_const"() {value = dense<1> : tensor} : () -> tensor + %cst_1 = "tfl.pseudo_const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %0 = "tfl.add"(%arg1, %cst_0) {fused_activation_function = "NONE"} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %1 = "tfl.strided_slice"(%arg0, %arg1, %0, %cst_1) {begin_mask = 0 : i32, ellipsis_mask = 1 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<4xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + func.return %1 : tensor<4xi32> +} + +// CHECK-LABEL: func @fuseSigmoid +func.func @fuseSigmoid(%arg0: tensor<10xf32>) -> tensor<10xf32> { + // CHECK: "tfl.logistic" + %cst = 
arith.constant dense<1.000000e+00> : tensor<10xf32> + %0 = "tfl.neg"(%arg0) : (tensor<10xf32>) -> tensor<10xf32> + %1 = "tfl.exp"(%0) : (tensor<10xf32>) -> tensor<10xf32> + %2 = tfl.add %1, %cst {fused_activation_function = "NONE"} : tensor<10xf32> + %3 = tfl.div %cst, %2 {fused_activation_function = "NONE"} : tensor<10xf32> + return %3 : tensor<10xf32> +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir index b549b564515..01ed79e5a63 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir @@ -3,6 +3,8 @@ // RUN: tf-opt %s -tfl-prepare-quantize-dynamic-range="enable-float16-quantization" | FileCheck --check-prefix=Float16 %s // RUN: tf-opt %s -tfl-prepare-quantize-dynamic-range="enable-custom-op-quantization=CustomTestOp=1-3,CustomTestOp3=3" | FileCheck --check-prefix=CustomOp %s // RUN: tf-opt %s -tfl-prepare-quantize-dynamic-range="min-elements-for-weights=4000 enable-custom-op-quantization=CustomTestOp=1-3,CustomTestOp3=3" | FileCheck --check-prefix=MinElement %s +// RUN: tf-opt %s -tfl-prepare-quantize-dynamic-range="min-elements-for-weights=19" | FileCheck --check-prefix=LSTMOpQuantized %s +// RUN: tf-opt %s -tfl-prepare-quantize-dynamic-range="min-elements-for-weights=21" | FileCheck --check-prefix=LSTMOpNotQuantized %s // CHECK-LABEL: QuantizeConv2D // PerTensor-LABEL: QuantizeConv2D @@ -409,3 +411,41 @@ func.func @LargeFloat16Constants(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112 // Float16-DAG: %[[w:.*]] = arith.constant dense<6.550400e+04> : tensor<64x3x3x3xf16> // Float16-DAG: %[[b:.*]] = arith.constant dense<-6.550400e+04> : tensor<64xf16> } + +// LSTMOpQuantized-LABEL: LSTMOpNotPartiallyQuantized +// LSTMOpNotQuantized-LABEL: LSTMOpNotPartiallyQuantized +func.func @LSTMOpNotPartiallyQuantized(%arg0: tensor<1x28x28xf32>) -> 
tensor<1x28x20xf32> { + %cst_2 = "tfl.no_value"() {value = unit} : () -> none + %cst_3 = arith.constant dense<1.0> : tensor<20x20xf32> + %cst_7 = arith.constant dense<1.0> : tensor<20xf32> + %recurrent_input = arith.constant dense<1.0> : tensor<1x20xf32> + %recurrent_stats = "quantfork.stats"(%recurrent_input) {layerStats = dense<[-2.0, 1.0]> : tensor<2xf32>} : (tensor<1x20xf32>) -> tensor<1x20xf32> + %cell_input = arith.constant dense<1.0> : tensor<1x20xf32> + %cell_stats = "quantfork.stats"(%cell_input) {layerStats = dense<[-2.73090601, 7.94872093]> : tensor<2xf32>} : (tensor<1x20xf32>) -> tensor<1x20xf32> + %0 = "tfl.unidirectional_sequence_lstm"(%arg0, + %cst_3, %cst_3, %cst_3, %cst_3, + %cst_3, %cst_3, %cst_3, %cst_3, + %cst_7, %cst_7, %cst_7, + %cst_7, %cst_7, %cst_7, %cst_7, + %cst_3, %cst_2, + %recurrent_stats, %cell_stats, + %cst_2, %cst_2, %cst_2, %cst_2) {cell_clip = 1.000000e+01 : f32, fused_activation_function = "TANH", proj_clip = 0.000000e+00 : f32, time_major = false} + : ( tensor<1x28x28xf32>, + tensor<20x20xf32>, tensor<20x20xf32>, tensor<20x20xf32>, tensor<20x20xf32>, + tensor<20x20xf32>, tensor<20x20xf32>, tensor<20x20xf32>, tensor<20x20xf32>, + tensor<20xf32>, tensor<20xf32>, tensor<20xf32>, + tensor<20xf32>, tensor<20xf32>, tensor<20xf32>, tensor<20xf32>, + tensor<20x20xf32>, none, + tensor<1x20xf32>, tensor<1x20xf32>, + none, none, none, none) -> tensor<1x28x20xf32> + %1 = "quantfork.stats"(%0) {layerStats = dense<[-1.0, 2.0]> : tensor<2xf32>} : (tensor<1x28x20xf32>) -> tensor<1x28x20xf32> + func.return %1 : tensor<1x28x20xf32> + +// LSTMOpQuantized-DAG: %[[dq1:.*]] = "tfl.dequantize"({{.*}}) : (tensor<20x20x!quant.uniform:f32, 0.0078740157480314959>>) -> tensor<20x20xf32> +// LSTMOpQuantized-DAG: %[[dq3:.*]] = "tfl.dequantize"({{.*}}) : (tensor<20x!quant.uniform:f32, 0.0078740157480314959>>) -> tensor<20xf32> +// LSTMOpQuantized: %[[lstm:.*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %[[dq1]], %[[dq1]], %[[dq1]], %[[dq1]], %[[dq1]], 
%[[dq1]], %[[dq1]], %[[dq1]], %[[dq3]], %[[dq3]], %[[dq3]], %cst_0, %cst_0, %cst_0, %cst_0, %[[dq1]], %0, %cst_1, %cst_1, %0, %0, %0, %0) + +// LSTMOpNotQuantized-DAG: %[[cst_1:.*]] = arith.constant dense<1.000000e+00> : tensor<20x20xf32> +// LSTMOpNotQuantized-DAG: %[[cst_3:.*]] = arith.constant dense<1.000000e+00> : tensor<20xf32> +// LSTMOpNotQuantized: %[[lstm:.*]] = "tfl.unidirectional_sequence_lstm"(%arg0, %[[cst_1]], %[[cst_1]], %[[cst_1]], %[[cst_1]], %[[cst_1]], %[[cst_1]], %[[cst_1]], %[[cst_1]], %[[cst_3]], %[[cst_3]], %[[cst_3]], %cst_0, %cst_0, %cst_0, %cst_0, %[[cst_1]], %0, %cst_1, %cst_1, %0, %0, %0, %0) +} diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index ea12951a97d..2a4b2af88f5 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -302,7 +302,7 @@ func.func @QuantizeSlice(tensor<2x3x5x!quant.uniform>, tensor<3xi32 func.func @QuantizeStridedSlice(tensor<12x2x2x5x!quant.uniform>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> { ^bb0(%arg0: tensor<12x2x2x5x!quant.uniform>, %arg1: tensor<1xi32>, %arg2: tensor<1xi32>, %arg3: tensor<1xi32>): %0 = "tfl.dequantize"(%arg0) : (tensor<12x2x2x5x!quant.uniform>) -> tensor<12x2x2x5xf32> - %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> + %1 = "tfl.strided_slice"(%0, %arg1, %arg2, %arg3) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<12x2x2x5xf32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<1x2x2x5xf32> func.return %1 : tensor<1x2x2x5xf32> // CHECK: %0 = "tfl.dequantize"(%arg0) diff --git 
a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index a668475a9e2..4f3914265b4 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -206,6 +206,19 @@ func.func @sharding(%arg0: tensor<10x10xi32>) -> (tensor<10x10xi32>) { // CHECK-NOT: %3 = "tf.XlaSharding"(%1) {_XlaSharding = "\08\03\1A\02\01\01\22\01\00", device = "", sharding = "\08\03\1A\02\01\01\22\01\00", unspecified_dims = []} : (tensor<10x10xi32>) -> tensor<10x10xi32> } +func.func @preventGradient(%arg0: tensor<10x10xi32>) -> (tensor<10x10xi32>) { + %0 = "tf.MatMul"(%arg0, %arg0) {device = "", transpose_a = false, transpose_b = false} : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<10x10xi32> + %1 = "tf.MatMul"(%arg0, %arg0) {device = "", transpose_a = false, transpose_b = false} : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<10x10xi32> + %2 = "tf.PreventGradient"(%0) : (tensor<10x10xi32>) -> tensor<10x10xi32> + %3 = "tf.PreventGradient"(%1) : (tensor<10x10xi32>) -> tensor<10x10xi32> + %4 = "tf.AddV2"(%2, %3) {device = ""} : (tensor<10x10xi32>, tensor<10x10xi32>) -> tensor<10x10xi32> + func.return %4 : tensor<10x10xi32> + +// CHECK-LABEL: preventGradient +// CHECK-NOT: %2 = "tf.PreventGradient"(%0) : (tensor<10x10xi32>) -> tensor<10x10xi32> +// CHECK-NOT: %3 = "tf.PreventGradient"(%1) : (tensor<10x10xi32>) -> tensor<10x10xi32> +} + func.func @matmulNoTransposeAOrB(%arg0: tensor<1x1280xf32>, %arg1: tensor<1280x1000xf32>) -> tensor<1x1000xf32> { %166 = "tf.MatMul"(%arg0, %arg1) {T = "tfdtype$DT_FLOAT", _output_shapes = ["tfshape$dim { size = 1} dim { size = 1000}"], device = "", name = "matmul", transpose_a = false, transpose_b = false} : (tensor<1x1280xf32>, tensor<1280x1000xf32>) -> tensor<1x1000xf32> func.return %166 : tensor<1x1000xf32> @@ -737,4 +750,12 @@ func.func @UnsupportedGroupConv_DynamicDimAtInputDimThree(%arg0: tensor, %fill: tensor) -> (tensor) { + %0 = 
"tf.Fill"(%shape, %fill) : (tensor, tensor) -> (tensor<*xf32>) + %1 = "tf.Shape"(%0) : (tensor<*xf32>) -> (tensor) + func.return %1 : tensor + + // CHECK-LABEL: RedundantShapeOp + // CHECK-NOT: "tf.Shape" +} } diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir index 4431444c1ba..58dfed58a69 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize-variables.mlir @@ -83,7 +83,7 @@ func.func @QuantizeReadAssign(%arg0: tensor<1x32x1x3xf32>) -> (tensor<1x34x1x3xf %4 = "tfl.concatenation"(%3, %1) {axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x2x1x3xf32>, tensor<1x32x1x3xf32>) -> tensor<1x34x1x3xf32> %5 = "tfl.quantize"(%4) {qtype = tensor<1x34x1x3x!quant.uniform>, volatile} : (tensor<1x34x1x3xf32>) -> tensor<1x34x1x3x!quant.uniform> %6 = "tfl.dequantize"(%5) : (tensor<1x34x1x3x!quant.uniform>) -> tensor<1x34x1x3xf32> - %7 = "tfl.strided_slice"(%6, %cst_1, %cst_0, %cst) {begin_mask = 13 : i32, ellipsis_mask = 0 : i32, end_mask = 15 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<1x34x1x3xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> + %7 = "tfl.strided_slice"(%6, %cst_1, %cst_0, %cst) {begin_mask = 13 : i32, ellipsis_mask = 0 : i32, end_mask = 15 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<1x34x1x3xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> %8 = "tfl.quantize"(%7) {qtype = tensor<1x2x1x3x!quant.uniform>, volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> %9 = "tfl.dequantize"(%8) : (tensor<1x2x1x3x!quant.uniform>) -> tensor<1x2x1x3xf32> "tfl.assign_variable"(%2, %9) : (tensor, tensor<1x2x1x3xf32>) -> () @@ -100,7 +100,7 @@ func.func @QuantizeReadAssign(%arg0: tensor<1x32x1x3xf32>) -> (tensor<1x34x1x3xf // CHECK-NEXT: %[[cc:.*]] = "tfl.concatenation"(%[[dq2]], %[[dq1]]) 
{axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x2x1x3xf32>, tensor<1x32x1x3xf32>) -> tensor<1x34x1x3xf32> // CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cc]]) {qtype = tensor<1x34x1x3x!quant.uniform>, volatile} : (tensor<1x34x1x3xf32>) -> tensor<1x34x1x3x!quant.uniform> // CHECK-NEXT: %[[dq3:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x34x1x3x!quant.uniform>) -> tensor<1x34x1x3xf32> -// CHECK-NEXT: %[[ss:.*]] = "tfl.strided_slice"(%[[dq3]], %[[cst_1]], %[[cst_0]], %[[cst]]) {begin_mask = 13 : i32, ellipsis_mask = 0 : i32, end_mask = 15 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<1x34x1x3xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> +// CHECK-NEXT: %[[ss:.*]] = "tfl.strided_slice"(%[[dq3]], %[[cst_1]], %[[cst_0]], %[[cst]]) {begin_mask = 13 : i32, ellipsis_mask = 0 : i32, end_mask = 15 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<1x34x1x3xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x2x1x3xf32> // CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%[[ss]]) {qtype = tensor<1x2x1x3x!quant.uniform>, volatile} : (tensor<1x2x1x3xf32>) -> tensor<1x2x1x3x!quant.uniform> // CHECK-NEXT: "tfl.assign_variable"(%[[vh]], %[[q3]]) : (tensor<*x!tf_type.resource>>>, tensor<1x2x1x3x!quant.uniform>) -> () // CHECK-NEXT: return %[[dq3]] : tensor<1x34x1x3xf32> @@ -127,7 +127,7 @@ func.func @QuantizeConvVariable(%arg0: tensor<1x3x1x1xf32>) -> (tensor<1x3x1x1xf %11 = "tfl.concatenation"(%7, %10) {axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x3x1x1xf32>, tensor<1x3x1x1xf32>) -> tensor<1x6x1x1xf32> %12 = "tfl.quantize"(%11) {qtype = tensor<1x6x1x1x!quant.uniform>, volatile} : (tensor<1x6x1x1xf32>) -> tensor<1x6x1x1x!quant.uniform> %13 = "tfl.dequantize"(%12) : (tensor<1x6x1x1x!quant.uniform>) -> tensor<1x6x1x1xf32> - %14 = "tfl.strided_slice"(%13, %cst_1, %cst_0, %cst) {begin_mask = 15 : i32, ellipsis_mask = 0 : i32, end_mask = 13 : i32, new_axis_mask = 0 
: i32, shrink_axis_mask = 0 : i32} : (tensor<1x6x1x1xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x3x1x1xf32> + %14 = "tfl.strided_slice"(%13, %cst_1, %cst_0, %cst) {begin_mask = 15 : i32, ellipsis_mask = 0 : i32, end_mask = 13 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<1x6x1x1xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x3x1x1xf32> %15 = "tfl.quantize"(%14) {qtype = tensor<1x3x1x1x!quant.uniform>, volatile} : (tensor<1x3x1x1xf32>) -> tensor<1x3x1x1x!quant.uniform> %16 = "tfl.dequantize"(%15) : (tensor<1x3x1x1x!quant.uniform>) -> tensor<1x3x1x1xf32> "tfl.assign_variable"(%6, %16) : (tensor, tensor<1x3x1x1xf32>) -> () @@ -157,7 +157,7 @@ func.func @QuantizeTwoVariable(%arg0: tensor<1x2x3xf32>) -> (tensor<1x2x3xf32>) %41 = "quantfork.stats"(%40) {layerStats = dense<[0.0, 1.0]> : tensor<2xf32>} : (tensor<1x2x3xf32>) -> tensor<1x2x3xf32> %42 = "tfl.concatenation"(%41, %0) {axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xf32>, tensor<1x2x3xf32>) -> tensor<1x4x3xf32> %43 = "quantfork.stats"(%42) {layerStats = dense<[-1.0, 1.0]> : tensor<2xf32>} : (tensor<1x4x3xf32>) -> tensor<1x4x3xf32> - %44 = "tfl.strided_slice"(%43, %1, %2, %3) {begin_mask = 7 : i32, ellipsis_mask = 0 : i32, end_mask = 5 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<1x4x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3xf32> + %44 = "tfl.strided_slice"(%43, %1, %2, %3) {begin_mask = 7 : i32, ellipsis_mask = 0 : i32, end_mask = 5 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<1x4x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3xf32> %45 = "quantfork.stats"(%44) {layerStats = dense<[0.0, 1.0]> : tensor<2xf32>} : (tensor<1x2x3xf32>) -> tensor<1x2x3xf32> "tfl.assign_variable"(%4, %45) : (tensor, tensor<1x2x3xf32>) -> () @@ -165,7 +165,7 @@ func.func @QuantizeTwoVariable(%arg0: tensor<1x2x3xf32>) -> 
(tensor<1x2x3xf32>) %51 = "quantfork.stats"(%50) {layerStats = dense<[0.0, 1.0]> : tensor<2xf32>} : (tensor<1x2x3xf32>) -> tensor<1x2x3xf32> %52 = "tfl.concatenation"(%51, %0) {axis = 1 : i32, fused_activation_function = "NONE"} : (tensor<1x2x3xf32>, tensor<1x2x3xf32>) -> tensor<1x4x3xf32> %53 = "quantfork.stats"(%52) {layerStats = dense<[0.0, 1.0]> : tensor<2xf32>} : (tensor<1x4x3xf32>) -> tensor<1x4x3xf32> - %54 = "tfl.strided_slice"(%53, %1, %2, %3) {begin_mask = 7 : i32, ellipsis_mask = 0 : i32, end_mask = 5 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<1x4x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3xf32> + %54 = "tfl.strided_slice"(%53, %1, %2, %3) {begin_mask = 7 : i32, ellipsis_mask = 0 : i32, end_mask = 5 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<1x4x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x2x3xf32> %55 = "quantfork.stats"(%54) {layerStats = dense<[0.0, 1.0]> : tensor<2xf32>} : (tensor<1x2x3xf32>) -> tensor<1x2x3xf32> "tfl.assign_variable"(%5, %55) : (tensor, tensor<1x2x3xf32>) -> () diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 84323be0555..8a7625d672b 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" +#include #include +#include #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -27,6 +29,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_passes.h" #include "tensorflow/compiler/mlir/lite/quantization/tensorflow/passes.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" @@ -122,6 +126,10 @@ void AddDynamicRangeQuantizationPasses( void AddConvertHloToTfPass(std::string entry_function_name, mlir::OpPassManager* pass_manager) { + pass_manager->addPass(mlir::odml::CreateRenameEntrypointToMainPass()); + pass_manager->addPass( + mlir::odml::CreateLegalizeTFXlaCallModuleToStablehloPass()); + pass_manager->addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); // Legalize jax random to tflite custom op. // The CreateLegalizeJaxRandom Pass has to stay at because we need to replace // the random function body before being inlined. @@ -150,6 +158,14 @@ void AddConvertHloToTfPass(std::string entry_function_name, pass_manager->addNestedPass( mlir::TF::CreateLegalizeHloToTfPass()); + // folds tf.BroadcastTo ops with subsequent ops if they have built in + // broadcasting support. This needs to be run immediately after HLO->TF + // legalization; otherwise other passes like `ConvertTFBroadcastTo` will + // constant fold the newly generated TF broadcast ops and materialize the + // weights. + pass_manager->addNestedPass( + mlir::TF::CreateBroadcastFoldPass()); + // Canonicalization after TF legalization. 
pass_manager->addNestedPass( mlir::createCanonicalizerPass()); diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index 2d25ef59a14..8b4057bc625 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -15,7 +15,10 @@ limitations under the License. #include #include +#include #include +#include +#include #include "absl/strings/str_split.h" #include "llvm/ADT/STLExtras.h" diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index 86dbe9c513e..51fd8dbc23e 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -165,24 +165,27 @@ StatusOr> LoadFromGraphdefOrMlirSource( auto extra_opdefs_status = RegisterExtraTfOpDefs(extra_tf_opdefs); if (!extra_opdefs_status.ok()) return extra_opdefs_status; + ::tensorflow::GraphdefToMlirOptions graphdef_conversion_options{ + std::string(debug_info_file), + /*xla_compile_device_type=*/"", + /*prune_unused_nodes=*/specs.prune_unused_nodes, + /*convert_legacy_fed_inputs=*/true, + /*graph_as_function=*/false, + specs.upgrade_legacy, + /*enable_shape_inference=*/false, + /*unconditionally_use_set_output_shapes=*/true, + /*enable_soft_placement=*/false}; + if (use_splatted_constant) { return tensorflow::GraphdefToSplattedMlirTranslateFunction( - file->getBuffer(), debug_info_file, /*xla_compile_device_type=*/"", - input_arrays, input_dtypes, input_shapes, output_arrays, - control_output_arrays, specs.prune_unused_nodes, - /*convert_legacy_fed_inputs=*/true, - /*graph_as_function=*/false, specs.upgrade_legacy, - /*enable_shape_inference=*/false, - /*unconditionally_use_set_output_shapes=*/true, context); + file->getBuffer(), input_arrays, input_dtypes, input_shapes, + output_arrays, control_output_arrays, graphdef_conversion_options, + context); } return 
tensorflow::GraphdefToMlirTranslateFunction( - file->getBuffer(), debug_info_file, /*xla_compile_device_type=*/"", - input_arrays, input_dtypes, input_shapes, output_arrays, - control_output_arrays, specs.prune_unused_nodes, - /*convert_legacy_fed_inputs=*/true, - /*graph_as_function=*/false, specs.upgrade_legacy, - /*enable_shape_inference=*/false, - /*unconditionally_use_set_output_shapes=*/true, context); + file->getBuffer(), input_arrays, input_dtypes, input_shapes, + output_arrays, control_output_arrays, graphdef_conversion_options, + context); } // Applying post-training dynamic range quantization from the old TOCO quantizer @@ -321,8 +324,7 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::PassManager pass_manager(module.getContext()); mlir::registerPassManagerCLOptions(); if (mlir::failed(mlir::applyPassManagerCLOptions(pass_manager))) { - return tensorflow::FromAbslStatus( - absl::UnknownError("failed to apply MLIR pass manager CL options")); + return absl::UnknownError("failed to apply MLIR pass manager CL options"); } pass_manager.addInstrumentation( std::make_unique( diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index 173f16ab488..95c9817f560 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -16,9 +16,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_TF_TO_TFL_FLATBUFFER_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_TF_TO_TFL_FLATBUFFER_H_ +#include #include #include #include +#include #include "absl/types/span.h" #include "llvm/Support/SourceMgr.h" @@ -53,7 +55,7 @@ tsl::StatusOr> LoadFromGraphdefOrMlirSource( // Load Saved model (either v1 or v2) into MLIR. // 'saved_model_bundle' will be initialized if V1 model was loaded. 
tsl::StatusOr> ImportSavedModel( - const std::string& input_filename, const int saved_model_version, + const std::string& input_filename, int saved_model_version, const std::unordered_set& tags, absl::Span extra_tf_opdefs, absl::Span exported_names, const GraphImportConfig& specs, diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index a020a4be43a..34373268527 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -50,6 +50,10 @@ def ConvertToQuantTypeFromAttrs : NativeCodeCall< def convertIntAttrTo32Bit : NativeCodeCall< "$_builder.getI32IntegerAttr($0.cast().getInt())">; +// Builds a constant bool attribute. +class GetBoolAttr : + NativeCodeCall<"$_builder.getBoolAttr(" # value #")">; + // Converts an integer attribute $0 to 64-bit with builder. def convertIntAttrTo64Bit : NativeCodeCall< "$_builder.getI64IntegerAttr($0.cast().getInt())">; @@ -69,6 +73,10 @@ def CreateTFCastToInt32Op : NativeCodeCall< def CreateNoneValue : NativeCodeCall< "$_builder.create($0.getLoc(), $_builder.getUnitAttr())">; +// Creates an int32 constant op from an integer attribute $0. +def CreateInt32ConstOpFromIntAttr + : NativeCodeCall<"$_builder.create($_loc, DenseElementsAttr::get(RankedTensorType::get({}, $_builder.getI32Type()), {static_cast($0.cast().getInt())}))">; + //===----------------------------------------------------------------------===// // Nullary ops patterns. 
//===----------------------------------------------------------------------===// @@ -373,6 +381,16 @@ def LegalizeSum : Pat<(TF_SumOp $arg, $axes, BoolAttr:$arg2), def LegalizeTopKV2 : Pat<(TF_TopKV2Op $input, $k, $ignored_sorted), (TFL_TopKV2Op $input, $k)>; +def ReductionDimensionIsLastDim : Constraint().getInt() == " + "$1.getType().cast().getRank() - 1 || $0.cast().getInt() == -1)">>; + +// Legalizes TF_ApproxTopKOp to TFL_TopKV2Op with the following constraints: +// 1. It computes max k +// 2. The reduction dimension is the last dim of the input. +def LegalizeApproxTopK : Pat<(TF_ApproxTopKOp $input, $k, $reduction_dimenstion, $ignored_recall_target, /*is_max_k*/ConstBoolAttrTrue, $ignored_reduction_input_size_override, $ignored_aggregate_to_topk), + (TFL_TopKV2Op $input, (CreateInt32ConstOpFromIntAttr $k)), + [(ReductionDimensionIsLastDim $reduction_dimenstion, $input)]>; + def LegalizeMin : Pat< (TF_MinOp $arg0, $axes, BoolAttr:$arg2), (TFL_ReduceMinOp $arg0, (CreateTFCastToInt32Op $axes), $arg2)>; @@ -534,7 +552,8 @@ def LegalizeStridedSlice : Pat< (CreateTFCastToInt32Op $strides), (convertIntAttrTo32Bit $begin_mask), (convertIntAttrTo32Bit $end_mask), (convertIntAttrTo32Bit $ellipsis_mask), (convertIntAttrTo32Bit $new_axis_mask), - (convertIntAttrTo32Bit $shrink_axis_mask))>; + (convertIntAttrTo32Bit $shrink_axis_mask), + (GetBoolAttr))>; def LegalizeRfft2d : Pat< (TF_RFFT2DOp $input, $fft_length), diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index 9b74c6bf606..7b31bcbc1a1 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -236,8 +237,8 @@ bool CanFuseConvOrDepthwiseConv(Attribute filter, Attribute val, return false; } -// Retuns true if we can eliminate the GatherNdOp or ScatterNdOp. When the value -// of `indices` are from 0 to n-1, the output tensor are identical to the +// Returns true if we can eliminate the GatherNdOp or ScatterNdOp. When the +// value of `indices` are from 0 to n-1, the output tensor are identical to the // `params`. bool CanOptimizeIdentityGatherNdOrScatterNdOp(Value params, DenseIntElementsAttr indices, @@ -344,15 +345,25 @@ TypeAttr RescaleQtype(Type input, Attribute factor) { // Returns shape of a ranked tensor. // Precondition: output_val's is ranked tensor. -DenseElementsAttr GetShape(Value output_val) { +// Returns a truncated shape when `truncate` is set to true. +DenseElementsAttr GetShape(Value output_val, bool truncate = false) { auto output_type = output_val.getType().cast(); SmallVector shape; shape.reserve(output_type.getRank()); - for (int64_t dim : output_type.getShape()) { + + bool needs_truncation = true; + for (size_t dim_idx = 0; dim_idx < output_type.getRank(); ++dim_idx) { + int64_t dim = output_type.getShape()[dim_idx]; + if (truncate && needs_truncation && dim == 1) { + continue; + } else if (needs_truncation && dim != 1) { + needs_truncation = false; + } shape.push_back(ShapedType::isDynamic(dim) ? 
-1 : static_cast(dim)); } + return mlir::DenseElementsAttr::get( RankedTensorType::get( {static_cast(shape.size())}, @@ -360,6 +371,34 @@ DenseElementsAttr GetShape(Value output_val) { llvm::ArrayRef(shape)); } +// Utility function to map final permutation to initial permutation +// initial -> permutation1 -> permutation2 -> final +DenseElementsAttr RemapPermutation(Value permutation1, Value permutation2) { + SmallVector initial_permutation; + DenseElementsAttr perm1_const; + DenseElementsAttr perm2_const; + + SmallVector new_permutation; + if (matchPattern(permutation1, m_Constant(&perm1_const)) && + matchPattern(permutation2, m_Constant(&perm2_const))) { + for (int32_t idx = 0; idx < perm1_const.getNumElements(); ++idx) { + initial_permutation.push_back(idx); + } + for (auto perm : perm2_const.getValues()) { + new_permutation.push_back( + initial_permutation[perm1_const + .getValues()[perm.getSExtValue()] + .getSExtValue()]); + } + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get( + {static_cast(new_permutation.size())}, + mlir::IntegerType::get(permutation1.getContext(), 32)), + llvm::ArrayRef(new_permutation)); +} + // Returns `true` if reducing `axes` in `input` with `keep_dims=true` results in // the specified `shape` and `false` otherwise. static bool ShapeMatchesReduceWithKeepAxes(Value input, @@ -480,8 +519,12 @@ Value ReshapeValueDroppingLastDim(OpBuilder &builder, Value value) { // so we could cast safely here. 
auto type = value.getType().cast(); SmallVector new_shape; - for (int64_t dim : type.getShape().drop_back()) { - new_shape.push_back(dim); + if (type.hasStaticShape()) { + for (int64_t dim : type.getShape().drop_back()) { + new_shape.push_back(dim); + } + } else { + new_shape.push_back(-1); } return builder.create( value.getLoc(), value, @@ -635,6 +678,78 @@ TypedAttr ConvertSingleElementAttrToFloatAttr(Attribute attr) { #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc" +struct FuseAddAndStridedSlice : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFL::StridedSliceOp strided_slice_op, + PatternRewriter &rewriter) const override { + // Match Add + mlir::TFL::AddOp add_op = + dyn_cast_or_null(strided_slice_op.getEnd().getDefiningOp()); + mlir::TFL::SubOp sub_op = + dyn_cast_or_null(strided_slice_op.getEnd().getDefiningOp()); + if (!(add_op || sub_op)) { + return failure(); + } + + // Check that add rhs is constant. + DenseElementsAttr added_value; + Value constant_val = add_op ? add_op.getRhs() : sub_op.getRhs(); + if (!matchPattern(constant_val, m_Constant(&added_value))) return failure(); + + // Check the add op is applied to begin. + mlir::TypedValue<::mlir::TensorType> begin_tensor = + strided_slice_op.getBegin(); + mlir::TypedValue<::mlir::TensorType> add_source_tensor = + add_op ? add_op.getLhs() : sub_op.getLhs(); + if (begin_tensor != add_source_tensor) { + return failure(); + } + + // Check that strides are constant + DenseElementsAttr strides_value; + Value strides_val = strided_slice_op.getStrides(); + if (!matchPattern(strides_val, m_Constant(&strides_value))) + return failure(); + + mlir::TensorType constant_val_type = + constant_val.getType().cast(); + // If it's not 1D or 0D (which can be broadcasted to 1D), reject the + // matching. 
+ if (constant_val_type.getRank() > 1) { + return failure(); + } + + mlir::RankedTensorType end_type = + strided_slice_op.getEnd().getType().dyn_cast(); + // begin, end and strides are Rank 1 tensors with one element per dimension + // of input. + int64_t num_dims = end_type.getShape()[0]; + DenseElementsAttr new_added_value = + added_value.reshape(RankedTensorType::get( + {num_dims}, + added_value.getType().cast().getElementType())); + ::mlir::arith::ConstantOp new_end = rewriter.create( + strided_slice_op.getEnd().getLoc(), new_added_value); + + if (strided_slice_op.getBeginMask() != 0) return failure(); + if (strided_slice_op.getEndMask() != 0) return failure(); + if (strided_slice_op.getEllipsisMask() != 0) return failure(); + mlir::TFL::StridedSliceOp new_strided_slice_op = + rewriter.create( + strided_slice_op.getLoc(), strided_slice_op.getOutput().getType(), + strided_slice_op.getInput(), strided_slice_op.getBegin(), new_end, + strided_slice_op.getStrides(), strided_slice_op.getBeginMask(), + strided_slice_op.getEndMask(), strided_slice_op.getEllipsisMask(), + strided_slice_op.getNewAxisMask(), + strided_slice_op.getShrinkAxisMask(), + /*offset=*/true); + rewriter.replaceOp(strided_slice_op, new_strided_slice_op.getOutput()); + + return success(); + } +}; + // Fuse Add with proceeding FullyConnected. // TODO(b/136285429): Move to tablegen when variadic is supported struct FuseFullyConnectedAndAdd : public OpRewritePattern { @@ -655,6 +770,9 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { // Check if the constant RHS is either 0D (scalar), or a 1D with // `{num_channels}` shape. auto constant_val_type = constant_val.getType().cast(); + if (constant_val_type.getRank() > 1) { + return failure(); + } // In TFLite FullyConnect definition, bias must be a 1D tensor where // the number of elements is equal to the number of channels. 
@@ -716,7 +834,7 @@ struct FuseFullyConnectedAndAdd : public OpRewritePattern { .getOutput(); } else { // If the RHS is neither a scalar constant nor a 1d constant, look - // if there is opportunity to reduce the dimentionality and allow + // if there is opportunity to reduce the dimensionality and allow // implicit broadcasting auto new_added_value = added_value.reshape(RankedTensorType::get( @@ -1803,7 +1921,7 @@ void OptimizePass::runOnOperation() { FuseBinaryOpToFollowingFullyConnected, FuseConv2DAndMulWithQDQs, FuseDepthwiseConv2DAndMulWithQDQs, ConvertTrivialTransposeOpToReshapeOp, RemoveReshapeAfterFullyConnected, RemoveReshapeBeforeFullyConnected, - FuseUnpackAndConcatToReshape, OptimizeTopK>(ctx); + FuseUnpackAndConcatToReshape, OptimizeTopK, FuseAddAndStridedSlice>(ctx); if (!this->disable_fuse_mul_and_fc_) { phase_2_patterns.add(ctx); } diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 216ac15c034..01357c332d5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -1,4 +1,4 @@ -/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -679,16 +679,56 @@ foreach ValueOp = [TFL_CeilOp, TFL_ExpOp, TFL_FloorOp, TFL_NegOp, // if called without a ranked tensor it will fail. def GetShape: NativeCodeCall<"GetShape($0)">; +// Returns truncated shape of a ranked-tensor. +// Truncated, here, means eliminating any contiguous 1s in the lower +// dimensions of the tensor +def GetTruncatedShape: NativeCodeCall<"GetShape($0, true)">; + // Returns True if the operand type is RankedTensorType and valid. 
def HasValidRankedTensor : Constraint() && " "$0.getType().cast().getNumDynamicDims() <= 1">>; +// Check if the truncated shape of the lhs is equal to the shape of rhs +def IsTruncatedShapeEqualTo : Constraint>; + def ConvertSqueezeToReshape : Pat< (TFL_SqueezeOp:$squeeze_op $input, $squeeze_dims), (TFL_ReshapeOp $input, (Arith_ConstantOp (GetShape $squeeze_op))), [(HasValidRankedTensor $squeeze_op)]>; +// Pattern to perform the following optimization +// transpose [1xAxB] -> [1xBxA] +// | +// reshape [1xBxA] -> [BxA] +// | +// transpose [BxA] -> [AxB] +// || +// reshape [1xAxB] -> [AxB] +def ConvertTrasposeReshapeTransposeToReshape : Pat< + (TFL_TransposeOp:$second_transpose + (TFL_ReshapeOp:$middle_reshape + (TFL_TransposeOp:$first_transpose $input, $permutation2), + $shape), + $permutation1), + (TFL_ReshapeOp $input, (Arith_ConstantOp (GetTruncatedShape $input))), + [(IsTruncatedShapeEqualTo $first_transpose, $middle_reshape), + (IsTruncatedShapeEqualTo $input, $second_transpose)]>; + +// Function to map final permutation to initial permutation +// initial -> permutation1 -> permutation2 -> final +def RemapPermutation: NativeCodeCall<"RemapPermutation($0, $1)">; + +// Pattern to fuse redundant transpose op +def FoldDoubleTranspose : Pat< + (TFL_TransposeOp + (TFL_TransposeOp:$transpose_out1 $input, (Arith_ConstantOp:$permutation1 $p1)), + (Arith_ConstantOp:$permutation2 $p2)), + (TFL_TransposeOp $input, + (Arith_ConstantOp (RemapPermutation $permutation1, $permutation2))), + [(HasOneUse $transpose_out1)]>; + // Convert expand_dims to reshape if possible. def ConvertExpandDimsToReshape : Pat< (TFL_ExpandDimsOp:$expand_dims_op $input, $dim), @@ -714,6 +754,19 @@ def MinimumOfReluAnd6ToRelu6 : (TFL_Relu6Op $x), [(IsConstantValueOf<6> $y)]>; +// For both relu1 and relu_0_to_1, the min/max operators commute, +// so there are two possible orderings we need to rewrite. +// Concretely, `m < n -> max(m, min(n, x)) = min(m, max(m, x))`. 
+// Proof: +// case (x <= m) +// max(m, min(n, x)) = max(m, m) = m and +// min(n, max(m, x)) = min(n, m) = m +// case (m < x < n) +// max(m, min(n, x)) = max(m, x) = x and +// min(n, max(m, x)) = min(n, x) = x +// case (n <= x) +// max(m, min(n, x)) = max(m, n) = n and +// min(n, max(m, x)) = min(n, x) = n def MatchRelu1Pattern1 : Pat< (TFL_MinimumOp (TFL_MaximumOp $input, (Arith_ConstantOp $NegOne)), (Arith_ConstantOp $One)), @@ -726,6 +779,18 @@ def MatchRelu1Pattern2 : Pat< (TFL_Relu1Op $input), [(FloatValueEquals<"-1"> $NegOne), (FloatValueEquals<"1"> $One)]>; +def MatchRelu0To1Pattern1: Pat< + (TFL_MinimumOp (TFL_MaximumOp $x, (Arith_ConstantOp $max_cst)), + (Arith_ConstantOp $min_cst)), + (TFL_Relu0To1Op $x), + [(FloatValueEquals<"0"> $max_cst), (FloatValueEquals<"1"> $min_cst)]>; + +def MatchRelu0To1Pattern2: Pat< + (TFL_MaximumOp (TFL_MinimumOp $x, (Arith_ConstantOp $min_cst)), + (Arith_ConstantOp $max_cst)), + (TFL_Relu0To1Op $x), + [(FloatValueEquals<"0"> $max_cst), (FloatValueEquals<"1"> $min_cst)]>; + def MatchLeakyRelu : Pat< (TFL_MaximumOp (TFL_MulOp:$mul_out $x, @@ -1027,9 +1092,18 @@ def ReshapeValueDroppingLastDim : NativeCodeCall< def IsOneHotIndexAttribute : Constraint>; -// Checks if the shape has shape with last dimension equals 1. +// Checks if the shape has static shape with last dimension equals 1. def IsLastDimensionEqualOne : Constraint>; +// As above but if shape is not static and rank 2 with last dim 1. 
+def IsLastDimensionEqualOneOrDynamicBatchDimRank2 : Constraint< + CPred<"IsLastDimensionEqualOne($0) || " + "(!$0.getType().cast().hasStaticShape() && " + " $0.getType().cast().hasRank() && " + " $0.getType().cast().getRank() == 2 && " + " !$0.getType().cast().getShape().empty() && " + " $0.getType().cast().getShape()[1] == 1)">>; + // Replace // Equal(X, indices) // With @@ -1044,7 +1118,7 @@ def ReshapeEqualOpToOneHotOp : Pat< (Arith_ConstantOp ConstantAttr, "true">), (Arith_ConstantOp ConstantAttr, "false">), ConstantAttr), - [(IsLastDimensionEqualOne $x), + [(IsLastDimensionEqualOneOrDynamicBatchDimRank2 $x), (HasRankAtLeast<2> $x), (IsOneHotIndexAttribute $series)]>; @@ -1258,3 +1332,18 @@ def SimplifyDoubleSelectFCZerosRHS : Pat< (AllValuesAreZero $zeros_1), (AllValuesAreZero $zeros_2) ]>; + +def FuseSigmoid : Pat< + (TFL_DivOp + (Arith_ConstantOp F32ElementsAttr:$ones), + (TFL_AddOp:$add_out + (TFL_ExpOp:$exp_out + (TFL_NegOp:$neg_out $arg)), + (Arith_ConstantOp F32ElementsAttr:$ones_1), TFL_AF_None), TFL_AF_None), + (TFL_LogisticOp $arg), + [(FloatValueEquals<"1"> $ones_1), + (FloatValueEquals<"1"> $ones), + (HasOneUse $neg_out), + (HasOneUse $add_out), + (HasOneUse $exp_out), + ]>; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td index 9064d6c7f50..47fc9df2ba5 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_patterns.td @@ -76,6 +76,7 @@ def ConvertPlaceholderWithDefault : Pat<(TF_PlaceholderWithDefaultOp $arg), (TF_ //===----------------------------------------------------------------------===// // Op removal patterns. 
//===----------------------------------------------------------------------===// +def RemovePreventGradient : Pat<(TF_PreventGradientOp $op, $msg), (replaceWithValue $op)>; def RemoveXlaSharding : Pat<(TF_XlaShardingOp $a, $b, $c), (replaceWithValue $a)>; def RemoveIdentityN : Pat<(TF_IdentityNOp $arg), (replaceWithValue $arg)>; @@ -196,3 +197,10 @@ def LowerUInt32AddV2 : Pat< (CreateTFCastOpI32 $rhs, /*truncate=*/ConstBoolAttrFalse)), /*truncate=*/ConstBoolAttrFalse), [(TensorOf<[TF_Uint32]> $lhs), (TensorOf<[TF_Uint32]> $rhs)]>; + +//===----------------------------------------------------------------------===// +// Fill op patterns. +//===----------------------------------------------------------------------===// + +def RemoveRedundantShapeOp : + Pat<(TF_ShapeOp (TF_FillOp $shape, $fill)), (replaceWithValue $shape)>; diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc index a19c29a666f..951748b3127 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc @@ -84,6 +84,10 @@ class PrepareDynamicRangeQuantizePass void runOnOperation() override; private: + // Keeps track of ops whose inputs cannot be quantized due to not meeting the + // minimum_elements_for_weights threshold. Prevents emitting duplicate + // warnings for the same op, once deemed ineligible for quantization. 
+ llvm::SetVector visited_nonquantizable_ops_; quant::QuantizationSpecs quant_specs_; }; @@ -95,8 +99,10 @@ class PrepareDynamicRangeQuantizableOp : public OpRewritePattern { public: explicit PrepareDynamicRangeQuantizableOp( - MLIRContext* context, const quant::QuantizationSpecs& quant_specs) + MLIRContext* context, const quant::QuantizationSpecs& quant_specs, + llvm::SetVector* const visited_nonquantizable_ops) : OpRewritePattern(context), + visited_nonquantizable_ops_(visited_nonquantizable_ops), quant_specs_(quant_specs) {} LogicalResult matchAndRewrite(arith::ConstantOp op, @@ -129,6 +135,8 @@ class PrepareDynamicRangeQuantizableOp } private: + llvm::SetVector* const visited_nonquantizable_ops_; + // Check if the operand_index is included in the quantizable_indices. bool isQuantizableIndex(const int operand_index, const std::vector& quantizable_indices) const { @@ -142,6 +150,10 @@ class PrepareDynamicRangeQuantizableOp // specification for checking the support. For custom ops, it checks the // provided map. bool hasInt8QuantizableOperandAt(Operation* op, int operand_index) const { + if (visited_nonquantizable_ops_->contains(op)) { + return false; + } + if (auto custom_op = llvm::dyn_cast_or_null(op)) { std::string op_name = custom_op.getCustomCode().str(); auto custom_map_iter = quant_specs_.custom_map.find(op_name); @@ -152,7 +164,53 @@ class PrepareDynamicRangeQuantizableOp llvm::dyn_cast(op)) { const auto& quantizable_indices = quantizable_op.GetQuantizableOperandIndices(); - return isQuantizableIndex(operand_index, quantizable_indices); + + if (!isQuantizableIndex(operand_index, quantizable_indices)) { + return false; + } + + // Special case handling for UnidirectionalSequenceLSTMOp, which doesn't + // support partial quantization of its inputs. + // Below, we check all of the input constants for the + // UnidirectionalSequenceLSTMOp to see if any of them would not be + // quantized due to not meeting the minimum_elements_for_weights + // threshold. 
Should we find any, we don't quantize any of the ops. + if (!llvm::dyn_cast(op)) { + return true; + } + + for (int qi : quantizable_indices) { + auto const_op = llvm::dyn_cast_or_null( + op->getOperand(qi).getDefiningOp()); + if (!const_op) { + continue; + } + + DenseFPElementsAttr attr; + if (!matchPattern(const_op->getResult(0), m_Constant(&attr))) { + continue; + } + + if (attr.dyn_cast().size() >= + quant_specs_.minimum_elements_for_weights) { + continue; + } + + visited_nonquantizable_ops_->insert(op); + op->emitWarning( + "Skipped quantization for UnidirectionalSequenceLSTMOp. Partial " + "quantization of inputs for UnidirectionalSequenceLSTMOp is not " + "supported. The operand ") + << const_op->getName().getStringRef().str() << " at index " << qi + << " was not quantized because it has " + << attr.dyn_cast().size() + << " elements which is fewer than the " + "`minimum_elements_for_weights` threshold of " + << quant_specs_.minimum_elements_for_weights; + return false; + } + + return true; } return false; } @@ -427,7 +485,8 @@ void PrepareDynamicRangeQuantizePass::runOnOperation() { removeAllStatsOp(func); RewritePatternSet patterns(&getContext()); - patterns.add(ctx, quant_specs_); + patterns.add(ctx, quant_specs_, + &visited_nonquantizable_ops_); (void)applyPatternsAndFoldGreedily(func, std::move(patterns)); ConvertMlirQuantOpsToTFLQuantOps(func); diff --git a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc index 9f2301d4803..86d0509ceb7 100644 --- a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" #include +#include #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" diff --git a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.cc b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.cc index f6b257128dd..28c6106dcb7 100644 --- a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h" #include +#include #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h index 093e53c0869..77b047f68c6 100644 --- a/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h @@ -19,6 +19,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_FAKE_QUANT_UTILS_H_ #include +#include #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc index f1f006e93a2..18320bba3c9 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include +#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" diff --git a/tensorflow/compiler/mlir/lite/utils/lstm_utils.h b/tensorflow/compiler/mlir/lite/utils/lstm_utils.h index 7421fe2faa8..9e01b5dbf75 100644 --- a/tensorflow/compiler/mlir/lite/utils/lstm_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/lstm_utils.h @@ -58,7 +58,7 @@ class ConvertLSTMCellSimpleToFusedLSTM { delete; ConvertLSTMCellSimpleToFusedLSTM& operator=( const ConvertLSTMCellSimpleToFusedLSTM&) = delete; - virtual ~ConvertLSTMCellSimpleToFusedLSTM() {} + virtual ~ConvertLSTMCellSimpleToFusedLSTM() = default; virtual llvm::StringRef GetCompositeOpName() { return kLstmCellSimple; } @@ -184,7 +184,7 @@ class ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM const ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM&) = delete; ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM& operator=( const ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM&) = delete; - ~ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM() override {} + ~ConvertLayerNormalizedLSTMCellSimpleToFusedLSTM() override = default; llvm::StringRef GetCompositeOpName() override { return kLayerNormalizedLstmCellSimple; diff --git a/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h b/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h index e5d7fd1a639..b78f7c86e45 100644 --- a/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/stateful_ops_utils.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STATEFUL_OPS_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_STATEFUL_OPS_UTILS_H_ +#include + #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project namespace mlir { diff --git a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc index a542267a14a..7ce9c56086e 100644 --- a/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/tftext_utils.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "flatbuffers/flexbuffers.h" // from @flatbuffers #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/lite/utils/validators.h b/tensorflow/compiler/mlir/lite/utils/validators.h index e16feb92652..6ff354fb23b 100644 --- a/tensorflow/compiler/mlir/lite/utils/validators.h +++ b/tensorflow/compiler/mlir/lite/utils/validators.h @@ -51,7 +51,7 @@ bool TFIntListIs1XY1(Operation *op, StringRef name, IntegerAttr *x, IntegerAttr *y); // Returns true if the attribute is an integer list of the form [1, X, Y, 1], -bool TFIntListIs1XY1(const Attribute attr); +bool TFIntListIs1XY1(Attribute attr); // Returns true if the given `op` // * has an attribute with the given `name`, @@ -62,7 +62,7 @@ bool TFIntListIs1XYZ1(Operation *op, StringRef name, IntegerAttr *x, // Returns true if every element of the attribute is 1. All elements of `attr` // must be `IntegerAttr`. -bool TFIntListIsAllOnes(const Attribute attr); +bool TFIntListIsAllOnes(Attribute attr); // Returns true iff the given value is a float32 tensor. // is "DT_FLOAT". diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc index dc69f3d64bb..c1753dd34fb 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc @@ -17,6 +17,8 @@ limitations under the License. 
#include #include +#include +#include #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" #include "absl/container/flat_hash_set.h" @@ -137,7 +139,8 @@ static void RegisterDialects(mlir::DialectRegistry& registry) { Status MlirFunctionOptimizationPass::Run( const std::string& function_name, const DeviceSet& device_set, - const ConfigProto& config_proto, absl::string_view xla_compile_device_type, + const ConfigProto& config_proto, + const FunctionOptimizationPass::FunctionOptions& function_options, std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, std::vector* control_ret_node_names, bool* control_rets_updated) { @@ -208,7 +211,9 @@ Status MlirFunctionOptimizationPass::Run( // the shape inference pass is run early in the pass pipeline, shape inference // during import is not necessary. import_config.enable_shape_inference = false; - import_config.xla_compile_device_type = xla_compile_device_type; + import_config.xla_compile_device_type = + function_options.xla_compile_device_type; + import_config.enable_soft_placement = function_options.allow_soft_placement; static const char* kTfMlirCategory = "TfMlir"; tensorflow::metrics::ScopedCounter<2> timings( diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h index d3a8420af94..059147d4ea9 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass.h @@ -17,7 +17,11 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_MLIR_GRAPH_OPTIMIZATION_PASS_H_ #include +#include +#include #include +#include +#include #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -120,7 +124,7 @@ class MlirFunctionOptimizationPass : public FunctionOptimizationPass { // Executes all of the underlying registered MlirOptimizationPasses. 
Status Run(const std::string& function_name, const DeviceSet& device_set, const ConfigProto& config_proto, - absl::string_view xla_compile_device_type, + const FunctionOptimizationPass::FunctionOptions& function_options, std::unique_ptr* graph, FunctionLibraryDefinition* flib_def, std::vector* control_ret_node_names, bool* control_rets_updated) override; diff --git a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc index 4e7d1449946..95d669ff9bf 100644 --- a/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc +++ b/tensorflow/compiler/mlir/mlir_graph_optimization_pass_test.cc @@ -40,45 +40,43 @@ constexpr char kFailure[] = "kFailure"; class MockMlirOptimizationPass : public MlirOptimizationPass { public: - // MOCK_METHOD does not work on Windows build, using MOCK_CONST_METHODX - // instead. - MOCK_CONST_METHOD0(name, llvm::StringRef()); - MOCK_CONST_METHOD4(GetPassState, - MlirOptimizationPassState( - const DeviceSet* device_set, - const ConfigProto& config_proto, const Graph& graph, - const FunctionLibraryDefinition& function_library)); - MOCK_METHOD5(Run, Status(const std::string& function_name, - const ConfigProto& config_proto, - mlir::ModuleOp module, const Graph& graph, - const FunctionLibraryDefinition& function_library)); + MOCK_METHOD(llvm::StringRef, name, (), (const, override)); + MOCK_METHOD(MlirOptimizationPassState, GetPassState, + (const DeviceSet* device_set, const ConfigProto& config_proto, + const Graph& graph, + const FunctionLibraryDefinition& function_library), + (const, override)); + MOCK_METHOD(Status, Run, + (const std::string& function_name, + const ConfigProto& config_proto, mlir::ModuleOp module, + const Graph& graph, + const FunctionLibraryDefinition& function_library), + (override)); }; class MockMlirV1CompatOptimizationPass : public MlirV1CompatOptimizationPass { public: - // MOCK_METHOD does not work on Windows build, using MOCK_CONST_METHODX - // 
instead. - MOCK_CONST_METHOD0(name, llvm::StringRef()); - MOCK_CONST_METHOD4(GetPassState, - MlirOptimizationPassState( - const DeviceSet* device_set, - const ConfigProto& config_proto, const Graph& graph, - const FunctionLibraryDefinition& function_library)); - MOCK_METHOD2(Run, Status(const GraphOptimizationPassOptions& options, - mlir::ModuleOp module)); + MOCK_METHOD(llvm::StringRef, name, (), (const, override)); + MOCK_METHOD(MlirOptimizationPassState, GetPassState, + (const DeviceSet* device_set, const ConfigProto& config_proto, + const Graph& graph, + const FunctionLibraryDefinition& function_library), + (const, override)); + MOCK_METHOD(Status, Run, + (const GraphOptimizationPassOptions& options, + mlir::ModuleOp module), + (override)); }; class ModifyMlirModulePass : public MlirOptimizationPass { public: explicit ModifyMlirModulePass(Status run_status) : run_status_(run_status) {} - // MOCK_METHOD does not work on Windows build, using MOCK_CONST_METHODX - // instead. - MOCK_CONST_METHOD0(name, llvm::StringRef()); - MOCK_CONST_METHOD4(GetPassState, - MlirOptimizationPassState( - const DeviceSet* device_set, - const ConfigProto& config_proto, const Graph& graph, - const FunctionLibraryDefinition& function_library)); + MOCK_METHOD(llvm::StringRef, name, (), (const, override)); + MOCK_METHOD(MlirOptimizationPassState, GetPassState, + (const DeviceSet* device_set, const ConfigProto& config_proto, + const Graph& graph, + const FunctionLibraryDefinition& function_library), + (const, override)); // Just modify MLIR module so that we can check whether original TF graph // has changed or not. 
@@ -187,12 +185,12 @@ class MlirGraphOptimizationPassTest : public Test { } ConfigProto config_proto_; + FunctionOptimizationPass::FunctionOptions function_options_; MlirFunctionOptimizationPass function_optimization_pass_; DeviceSet device_set_; std::unique_ptr graph_; std::unique_ptr flib_; std::vector control_ret_node_names_; - std::string xla_compile_device_type_; bool control_rets_updated_{false}; monitoring::testing::CellReader mlir_function_pass_fallback_count_ = monitoring::testing::CellReader( @@ -219,11 +217,11 @@ TEST_F(MlirGraphOptimizationPassTest, OptimizationPassFailsNoFallback) { GraphDef original_graph_def; graph_->ToGraphDef(&original_graph_def); - EXPECT_EQ(function_optimization_pass_.Run( - "test_func", device_set_, config_proto_, - xla_compile_device_type_, &graph_, flib_.get(), - &control_ret_node_names_, &control_rets_updated_), - Status(absl::StatusCode::kAborted, "aborted")); + EXPECT_EQ( + function_optimization_pass_.Run( + "test_func", device_set_, config_proto_, function_options_, &graph_, + flib_.get(), &control_ret_node_names_, &control_rets_updated_), + Status(absl::StatusCode::kAborted, "aborted")); verifyGraph(original_graph_def); verifyCounters(); } @@ -246,11 +244,11 @@ TEST_F(MlirGraphOptimizationPassTest, OptimizationPassFailsDisabledFallback) { AddModuleModificationPass(MlirOptimizationPassState::FallbackEnabled, Status(absl::StatusCode::kAborted, "aborted")); - EXPECT_EQ(function_optimization_pass_.Run( - "test_func", device_set_, config_proto_, - xla_compile_device_type_, &graph_, flib_.get(), - &control_ret_node_names_, &control_rets_updated_), - OkStatus()); + EXPECT_EQ( + function_optimization_pass_.Run( + "test_func", device_set_, config_proto_, function_options_, &graph_, + flib_.get(), &control_ret_node_names_, &control_rets_updated_), + OkStatus()); verifyGraph(original_graph_def); verifyCounters(); } @@ -263,11 +261,11 @@ TEST_F(MlirGraphOptimizationPassTest, OptimizationPassDoesNotFailFallback) { 
AddModuleModificationPass(MlirOptimizationPassState::FallbackEnabled, OkStatus()); - EXPECT_EQ(function_optimization_pass_.Run( - "test_func", device_set_, config_proto_, - xla_compile_device_type_, &graph_, flib_.get(), - &control_ret_node_names_, &control_rets_updated_), - OkStatus()); + EXPECT_EQ( + function_optimization_pass_.Run( + "test_func", device_set_, config_proto_, function_options_, &graph_, + flib_.get(), &control_ret_node_names_, &control_rets_updated_), + OkStatus()); verifyGraph(original_graph_def, true); verifyCounters(); @@ -281,11 +279,11 @@ TEST_F(MlirGraphOptimizationPassTest, GraphDoesntConvertUpdatesCounter) { AddModuleModificationPass(MlirOptimizationPassState::FallbackEnabled, OkStatus()); - EXPECT_EQ(function_optimization_pass_.Run( - "test_func", device_set_, config_proto_, - xla_compile_device_type_, &graph_, flib_.get(), - &control_ret_node_names_, &control_rets_updated_), - OkStatus()); + EXPECT_EQ( + function_optimization_pass_.Run( + "test_func", device_set_, config_proto_, function_options_, &graph_, + flib_.get(), &control_ret_node_names_, &control_rets_updated_), + OkStatus()); EXPECT_EQ(mlir_function_pass_graph_conversion_count_.Read(kOk), 0); EXPECT_EQ(mlir_function_pass_graph_conversion_count_.Read(kInvalidArgument), diff --git a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc index cbd03639c02..f5912553f10 100644 --- a/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc +++ b/tensorflow/compiler/mlir/op_or_arg_name_mapper.cc @@ -40,7 +40,7 @@ static inline llvm::StringRef StringViewToRef(absl::string_view view) { namespace tensorflow { -OpOrArgNameMapper::~OpOrArgNameMapper() {} +OpOrArgNameMapper::~OpOrArgNameMapper() = default; llvm::StringRef OpOrArgNameMapper::GetUniqueName(llvm::StringRef prefix, int hash_value) { diff --git a/tensorflow/compiler/mlir/python/mlir.cc b/tensorflow/compiler/mlir/python/mlir.cc index 7cc1d25355e..b709ede8956 100644 --- 
a/tensorflow/compiler/mlir/python/mlir.cc +++ b/tensorflow/compiler/mlir/python/mlir.cc @@ -15,9 +15,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/python/mlir.h" +#include #include #include +#include #include +#include #include "absl/algorithm/container.h" #include "absl/container/flat_hash_set.h" @@ -388,6 +391,8 @@ void ExperimentalWriteBytecode(const std::string& filename, } mlir::FallbackAsmResourceMap fallback_resource_map; mlir::BytecodeWriterConfig writer_config(fallback_resource_map); + // TODO(jpienaar): Make this an option to the call. + writer_config.setDesiredBytecodeVersion(1); std::string error; std::unique_ptr outputFile = mlir::openOutputFile(filename, &error); @@ -446,6 +451,8 @@ void ExperimentalTFLiteToTosaBytecode( } mlir::FallbackAsmResourceMap fallback_resource_map; mlir::BytecodeWriterConfig writer_config(fallback_resource_map); + // TODO(jpienaar): Make this an option to the call. + writer_config.setDesiredBytecodeVersion(1); std::string error; std::unique_ptr outputFile = mlir::openOutputFile(tosa_bytecode_file, &error); diff --git a/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD index 807d0f497df..3e184602595 100644 --- a/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD +++ b/tensorflow/compiler/mlir/python/mlir_wrapper/BUILD @@ -20,8 +20,8 @@ tf_python_pybind_extension( deps = [ "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/python:pybind11_lib", - "//tensorflow/python:pybind11_status", + "//tensorflow/python/lib/core:pybind11_lib", + "//tensorflow/python/lib/core:pybind11_status", "@llvm-project//llvm:FileCheckLib", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", @@ -36,8 +36,8 @@ tf_python_pybind_extension( srcs = ["filecheck_wrapper.cc"], visibility = ["//visibility:public"], deps = [ - "//tensorflow/python:pybind11_lib", - "//tensorflow/python:pybind11_status", + 
"//tensorflow/python/lib/core:pybind11_lib", + "//tensorflow/python/lib/core:pybind11_status", "@llvm-project//llvm:FileCheckLib", "@llvm-project//llvm:Support", "@pybind11", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index f85f8f13882..50e4037fa1c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -15,6 +15,15 @@ package_group( ] + internal_visibility_allowlist(), ) +package( + # copybara:uncomment default_applicable_licenses = ["@stablehlo//:license"], + default_visibility = [ + ":internal_visibility_allowlist_package", + "//tensorflow:__pkg__", + ], + licenses = ["notice"], +) + # TODO(b/264218457): Add quantize and post_quantize passes. cc_library( name = "passes", @@ -26,19 +35,18 @@ cc_library( ], compatible_with = get_compatible_with_cloud(), deps = [ + ":fill_quantization_options", ":quantization_options_proto_cc", ":stablehlo_passes_inc_gen", - "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/core/platform:path", "//third_party/eigen3", - "@com_google_absl//absl/container:flat_hash_set", "@llvm-project//llvm:Support", - "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Rewrite", "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@stablehlo//:stablehlo_ops", @@ -59,11 +67,14 @@ cc_library( compatible_with = get_compatible_with_cloud(), visibility = [":internal_visibility_allowlist_package"], deps = [ + ":fill_quantization_options", ":passes", ":quantization_options_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_passes", "//tensorflow/core/platform:path", + 
"@com_google_absl//absl/container:flat_hash_set", + "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:Pass", ], @@ -87,6 +98,17 @@ gentbl_cc_library( ], ) +cc_library( + name = "fill_quantization_options", + srcs = ["utils/fill_quantization_options.cc"], + hdrs = ["utils/fill_quantization_options.h"], + compatible_with = get_compatible_with_cloud(), + deps = [ + ":quantization_options_proto_cc", + "@llvm-project//llvm:Support", + ], +) + tf_proto_library( name = "quantization_options_proto", srcs = ["quantization_options.proto"], diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h index 788a00f349c..acd3657484e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h @@ -31,7 +31,8 @@ namespace stablehlo { // Creates a pass that quantizes weight component of StableHLO graph. std::unique_ptr> CreateQuantizeWeightPass( - ::stablehlo::quantization::QuantizationOptions quantization_options); + ::stablehlo::quantization::QuantizationComponentSpec + quantization_component_spec); } // namespace stablehlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc index 9d5d0cc8e91..d4480dbf170 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_weight.cc @@ -14,29 +14,31 @@ limitations under the License. 
==============================================================================*/ #include -#include #include -#include #include #include #include "third_party/eigen3/Eigen/Core" -#include "llvm/Support/CommandLine.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Support/Debug.h" -#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project -#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Rewrite/FrozenRewritePatternSet.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h" // NOLINTNEXTLINE //===----------------------------------------------------------------------===// @@ -50,6 +52,7 @@ namespace { #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" using QuantizationUnits = llvm::SetVector>; +using ::stablehlo::quantization::QuantizationComponentSpec; // 
Min/Max values used for creating ConstantOp. constexpr float kMaxFloat16Value = 65504.f; @@ -61,7 +64,8 @@ class QuantizeWeightPass MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(QuantizeWeightPass) explicit QuantizeWeightPass( - ::stablehlo::quantization::QuantizationOptions quantization_options) {} + QuantizationComponentSpec quantization_component_spec) + : quantization_component_spec_(quantization_component_spec) {} StringRef getArgument() const final { // This is the argument used to refer to the pass in @@ -75,13 +79,17 @@ class QuantizeWeightPass private: void runOnOperation() override; + QuantizationComponentSpec quantization_component_spec_; }; // Collects quantizable target ops, then insert Q-DQ quantization patterns. class QuantizeWeight : public OpRewritePattern { public: - explicit QuantizeWeight(MLIRContext* context) - : OpRewritePattern(context) {} + explicit QuantizeWeight( + MLIRContext* context, + const QuantizationComponentSpec& quantization_component_spec) + : OpRewritePattern(context), + quantization_component_spec_(quantization_component_spec) {} LogicalResult matchAndRewrite(ConstantOp op, PatternRewriter& rewriter) const override { @@ -104,6 +112,7 @@ class QuantizeWeight : public OpRewritePattern { } private: + const QuantizationComponentSpec quantization_component_spec_; // Marks users that are applicable for quantization where the criteria for // determining quantizable ops differs by the inference type. QuantizationUnits GetQuantizableOps(ConstantOp op) const { @@ -125,7 +134,6 @@ class QuantizeWeight : public OpRewritePattern { // Returns whether quantization is applied to filtered users. bool QuantizeOps(PatternRewriter& rewriter, ConstantOp op, const QuantizationUnits& quantizable_ops) const { - // TODO(b/212514817): refactor mode checking to improve code quality. for (const std::pair& quant_op : quantizable_ops) { // For f16 quantization, quantize all constant ops as float16. 
QuantizeOpAsFloat16(rewriter, op, quant_op); @@ -222,9 +230,9 @@ class QuantizeWeight : public OpRewritePattern { void QuantizeWeightPass::runOnOperation() { func::FuncOp func = getOperation(); MLIRContext* ctx = func.getContext(); - RewritePatternSet patterns(ctx); - patterns.add(ctx); + + patterns.add(ctx, quantization_component_spec_); FrozenRewritePatternSet frozen_patterns(std::move(patterns)); @@ -237,8 +245,8 @@ void QuantizeWeightPass::runOnOperation() { // Creates an instance of the StableHLO dialect Quantize Weight pass. std::unique_ptr> CreateQuantizeWeightPass( - ::stablehlo::quantization::QuantizationOptions quantization_options) { - return std::make_unique(quantization_options); + QuantizationComponentSpec quantization_component_spec) { + return std::make_unique(quantization_component_spec); } } // namespace stablehlo } // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.cc b/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.cc index 05290bcb126..31bb012e372 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.cc @@ -14,9 +14,13 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h" +#include "absl/container/flat_hash_set.h" +#include "llvm/Support/Debug.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" namespace stablehlo { @@ -24,8 +28,30 @@ namespace quantization { void AddQuantizationPasses(mlir::PassManager& pass_manager, const QuantizationOptions& quantization_options) { + QuantizationOptions quantization_options_ = quantization_options; + if (quantization_options.quantization_method() + .has_preset_quantization_method()) { + quantization_options_ = + mlir::stablehlo::FillPresetQuantizationOptions(quantization_options); + } + + // TODO(b/276999414): Add activation and bias quantization component as + // respective quantization passes are created. 
+ QuantizationComponentSpec weight_component; + for (const auto& component : quantization_options_.quantization_method() + .custom_quantization_method() + .quantization_component_spec()) { + switch (component.quantization_component()) { + case QuantizationComponentSpec::COMPONENT_WEIGHT: + weight_component = component; + break; + default: + break; + } + } + pass_manager.addNestedPass( - mlir::stablehlo::CreateQuantizeWeightPass(quantization_options)); + mlir::stablehlo::CreateQuantizeWeightPass(weight_component)); } } // namespace quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD index 00c76a029e9..4b657b51762 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD @@ -1,5 +1,6 @@ load("//tensorflow:tensorflow.default.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -7,6 +8,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir/quantization/stablehlo:run_lit.sh", size_override = { @@ -27,3 +29,15 @@ filegroup( # TODO(b/254144841): Add tests in this directory with the proper stablehlo-opt. 
], ) + +tf_cc_test( + name = "fill_quantization_options_test", + srcs = ["fill_quantization_options_test.cc"], + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:fill_quantization_options", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_options_proto_cc", + "//tensorflow/tsl/platform:protobuf", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/fill_quantization_options_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/tests/fill_quantization_options_test.cc new file mode 100644 index 00000000000..55ef992934b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/fill_quantization_options_test.cc @@ -0,0 +1,110 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h" + +#include +#include + +#include +#include +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" +#include "tensorflow/tsl/platform/protobuf.h" + +namespace mlir::stablehlo { +namespace { + +using ::stablehlo::quantization::PresetQuantizationMethod; +using ::stablehlo::quantization::QuantizationComponentSpec; +using ::stablehlo::quantization::QuantizationOptions; + +// Simple implementation of ::testing::EqualsProto equivalent until open source +// b/135192747 is fixed. Originally from type_to_shape_test.cc. +class ProtoStringMatcher { + public: + explicit ProtoStringMatcher(const tsl::protobuf::Message& expected) + : expected_(expected.SerializeAsString()) {} + + template + bool MatchAndExplain(const Message& p, testing::MatchResultListener*) const { + return p.SerializeAsString() == expected_; + } + + void DescribeTo(::std::ostream* os) const { *os << expected_; } + void DescribeNegationTo(::std::ostream* os) const { + *os << "not equal to expected message: " << expected_; + } + + private: + const std::string expected_; +}; + +inline ::testing::PolymorphicMatcher EqualsProto( + const tsl::protobuf::Message& x) { + return ::testing::MakePolymorphicMatcher(ProtoStringMatcher(x)); +} + +void FillPresetQuantizationOptionsTestHelper( + const PresetQuantizationMethod::PresetMethod preset_quantization_options, + const QuantizationComponentSpec expected_activation_component, + const QuantizationComponentSpec expected_weight_component, + const QuantizationComponentSpec expected_bias_component) { + QuantizationOptions quantization_options; + quantization_options.mutable_quantization_method() + ->mutable_preset_quantization_method() + ->set_preset_method(preset_quantization_options); + QuantizationOptions filled_quantization_options = + 
FillPresetQuantizationOptions(quantization_options); + for (QuantizationComponentSpec component : + filled_quantization_options.quantization_method() + .custom_quantization_method() + .quantization_component_spec()) { + switch (component.quantization_component()) { + case (QuantizationComponentSpec::COMPONENT_ACTIVATION): + EXPECT_THAT(component, EqualsProto(expected_activation_component)); + break; + case (QuantizationComponentSpec::COMPONENT_WEIGHT): + EXPECT_THAT(component, EqualsProto(expected_weight_component)); + break; + case (QuantizationComponentSpec::COMPONENT_BIAS): + EXPECT_THAT(component, EqualsProto(expected_bias_component)); + break; + default: + break; + } + } +} + +TEST(FillQuantizationOptionsTest, PresetFloat16) { + QuantizationComponentSpec activation_component, weight_component, + bias_component; + weight_component.set_quantization_component( + QuantizationComponentSpec::COMPONENT_WEIGHT); + weight_component.set_bit_width(QuantizationComponentSpec::BIT_WIDTH_16); + weight_component.set_bit_type(QuantizationComponentSpec::BIT_TYPE_FLOAT); + bias_component.set_quantization_component( + QuantizationComponentSpec::COMPONENT_BIAS); + bias_component.set_bit_width(QuantizationComponentSpec::BIT_WIDTH_16); + bias_component.set_bit_type(QuantizationComponentSpec::BIT_TYPE_FLOAT); + + FillPresetQuantizationOptionsTestHelper( + /*preset_quantization_options=*/PresetQuantizationMethod::FLOAT16, + /*expected_activation_component=*/activation_component, + /*expected_weight_component*/ weight_component, + /*expected_bias_component*/ bias_component); +} + +} // namespace +} // namespace mlir::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.cc b/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.cc new file mode 100644 index 00000000000..bff29736476 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.cc @@ -0,0 +1,71 @@ +/* 
Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "llvm/Support/Debug.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +namespace mlir { +namespace stablehlo { + +using ::stablehlo::quantization::CustomQuantizationMethod; +using ::stablehlo::quantization::PresetQuantizationMethod; +using ::stablehlo::quantization::QuantizationComponentSpec; + +// Returns QuantizationOptions filled with detailed specs when user specifies +// an optional preset method name. The preset methods are defined in +// quantization_options.proto. This function will only be executed if a user +// gives a preset method, not a custom method. 
+::stablehlo::quantization::QuantizationOptions FillPresetQuantizationOptions( +    ::stablehlo::quantization::QuantizationOptions quantization_options_) { +  CustomQuantizationMethod custom_method = +      quantization_options_.quantization_method().custom_quantization_method(); +  QuantizationComponentSpec *weight_component, *bias_component; +  auto preset_method = quantization_options_.quantization_method() +                           .preset_quantization_method() +                           .preset_method(); +  if (!preset_method) return quantization_options_; +  switch (preset_method) { +    case PresetQuantizationMethod::FLOAT16: +      weight_component = custom_method.add_quantization_component_spec(); +      weight_component->set_quantization_component( +          QuantizationComponentSpec::COMPONENT_WEIGHT); +      weight_component->set_bit_width(QuantizationComponentSpec::BIT_WIDTH_16); +      weight_component->set_bit_type(QuantizationComponentSpec::BIT_TYPE_FLOAT); +      bias_component = custom_method.add_quantization_component_spec(); +      bias_component->set_quantization_component( +          QuantizationComponentSpec::COMPONENT_BIAS); +      bias_component->set_bit_width(QuantizationComponentSpec::BIT_WIDTH_16); +      bias_component->set_bit_type(QuantizationComponentSpec::BIT_TYPE_FLOAT); +      break; +    // Note: This is weight-only quantization by default, but with the legacy +    // flag "--force_dynamic_range_in_kernel", a DRQ behavior will be forced +    // in the kernel.
+ case PresetQuantizationMethod::WEIGHT_ONLY: + weight_component = custom_method.add_quantization_component_spec(); + weight_component->set_quantization_component( + QuantizationComponentSpec::COMPONENT_WEIGHT); + weight_component->set_bit_width(QuantizationComponentSpec::BIT_WIDTH_8); + weight_component->set_bit_type(QuantizationComponentSpec::BIT_TYPE_INT); + break; + default: + break; + } + *quantization_options_.mutable_quantization_method() + ->mutable_custom_quantization_method() = custom_method; + return quantization_options_; +} + +} // namespace stablehlo +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h b/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h new file mode 100644 index 00000000000..782920826c6 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/utils/fill_quantization_options.h @@ -0,0 +1,30 @@ +/* Copyright 2023 The StableHLO Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_FILL_QUANTIZATION_OPTIONS_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_FILL_QUANTIZATION_OPTIONS_H_ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" + +namespace mlir { +namespace stablehlo { + +::stablehlo::quantization::QuantizationOptions FillPresetQuantizationOptions( + ::stablehlo::quantization::QuantizationOptions quantization_options); + +} // namespace stablehlo +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UTILS_FILL_QUANTIZATION_OPTIONS_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD index 2d42d137f9b..6e6a6c8077f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD @@ -22,17 +22,6 @@ package( licenses = ["notice"], ) -cc_library( - name = "constants", - hdrs = [ - "constants.h", - ], - compatible_with = get_compatible_with_cloud(), - deps = [ - "@com_google_absl//absl/strings", - ], -) - py_binary( name = "gen_quantized_function_library", srcs = ["gen_quantized_function_library.py"], @@ -78,6 +67,23 @@ cc_library( ], ) +cc_library( + name = "manipulate_model_attr", + srcs = [ + "passes/manipulate_model_attr.cc", + ], + hdrs = [ + "passes/manipulate_model_attr.h", + ], + compatible_with = get_compatible_with_cloud(), + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "remove_identity_op_pattern", srcs = [ @@ -375,11 +381,13 @@ cc_library( "passes/insert_restore_op.cc", "passes/insert_save_op.cc", "passes/issue_ids_of_custom_aggregation_ops.cc", + "passes/lift_hashtable_ops_as_args.cc", "passes/lift_quantizable_spots_as_functions.cc", 
"passes/lift_quantizable_spots_as_functions.inc", "passes/lift_quantizable_spots_as_functions_drq.cc", "passes/lift_quantizable_spots_as_functions_drq.inc", "passes/mark_functions_noinline.cc", + "passes/merge_duplicate_resource_ops.cc", "passes/merge_initializer_function_ops_to_main.cc", "passes/merge_save_function_ops_to_main.cc", "passes/optimize.cc", @@ -408,7 +416,7 @@ cc_library( ], compatible_with = get_compatible_with_cloud(), deps = [ - ":constants", + ":manipulate_model_attr", ":pass_utils", ":quantization_options_proto_cc", ":remove_identity_op_pattern", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD index 831bf9980ca..1f1ae5a13de 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/BUILD @@ -93,7 +93,7 @@ tf_py_test( ":gen_custom_aggregator_op_wrapper", "//tensorflow:tensorflow_py", "//tensorflow/compiler/mlir/quantization/tensorflow/python:pywrap_quantize_model", - "//tensorflow/python:client_testlib", "//tensorflow/python:pywrap_tensorflow", + "//tensorflow/python/platform:client_testlib", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc index 3f153106f8a..23e7ee54f13 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.cc @@ -15,7 +15,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" #include +#include #include +#include namespace tensorflow { namespace calibrator { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h index 7c0f830505a..c87fed2569c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_CALIBRATOR_CALIBRATOR_SINGLETON_H_ #include +#include #include #include diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc index 189c0b319f6..cd8a473fa90 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton_test.cc @@ -14,6 +14,9 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" +#include +#include + #include "tensorflow/core/platform/test.h" namespace tensorflow { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc index a911b495e20..f640b1aa3d7 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/calibrator/custom_aggregator_op.cc @@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/compiler/mlir/quantization/tensorflow/calibrator/calibrator_singleton.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_kernel.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.cc index 4ad3a370efc..1e442583b1d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables.cc @@ -70,12 +70,12 @@ absl::StatusOr AddTensorToBundleWriter( if (const tsl::Status status = mlir::tfg::ConvertToTensor( /*attr=*/const_op.getValue(), /*output_tensor=*/&const_tensor); !status.ok()) { - return tsl::ToAbslStatus(status); + return status; } if (!bundle_writer.Add(/*key=*/var_handle_op.getSharedName(), const_tensor) .ok()) { - return tsl::ToAbslStatus(bundle_writer.status()); + return bundle_writer.status(); } return var_handle_op.getSharedName().str(); @@ -97,7 +97,7 @@ absl::StatusOr> SaveVariablesToCheckpoint( BundleWriter bundle_writer(Env::Default(), prefix); if (!bundle_writer.status().ok()) { - return tsl::ToAbslStatus(bundle_writer.status()); + return bundle_writer.status(); } std::vector saved_variable_shared_names; @@ -122,7 +122,7 @@ absl::StatusOr> SaveVariablesToCheckpoint( } if (!bundle_writer.Finish().ok()) { - return tsl::ToAbslStatus(bundle_writer.status()); + return bundle_writer.status(); } return saved_variable_shared_names; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc index 8967b64b877..fefff2345f6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc +++ 
b/tensorflow/compiler/mlir/quantization/tensorflow/cc/save_variables_test.cc @@ -114,8 +114,7 @@ TEST_F(SaveVariablesToCheckpointTest, VariableSavedToCheckpoint) { BundleReader bundle_reader(env_, *checkpoint_prefix); Tensor loaded_tensor{}; - EXPECT_TRUE( - tsl::ToAbslStatus(bundle_reader.Lookup("var_0", &loaded_tensor)).ok()); + EXPECT_TRUE(bundle_reader.Lookup("var_0", &loaded_tensor).ok()); ExpectEqual(loaded_tensor, AsTensor({1.0, 2.0})); } @@ -161,13 +160,11 @@ TEST_F(SaveVariablesToCheckpointTest, MultipleVariablesSavedToCheckpoint) { BundleReader bundle_reader(env_, *checkpoint_prefix); Tensor loaded_var_0{}; - EXPECT_TRUE( - tsl::ToAbslStatus(bundle_reader.Lookup("var_0", &loaded_var_0)).ok()); + EXPECT_TRUE(bundle_reader.Lookup("var_0", &loaded_var_0).ok()); ExpectEqual(loaded_var_0, AsTensor({1.0, 2.0})); Tensor loaded_var_1{}; - EXPECT_TRUE( - tsl::ToAbslStatus(bundle_reader.Lookup("var_1", &loaded_var_1)).ok()); + EXPECT_TRUE(bundle_reader.Lookup("var_1", &loaded_var_1).ok()); ExpectEqual(loaded_var_1, AsTensor({3, 4, 5, 6})); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD index 448ac05842f..879ccc88de0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD @@ -2,6 +2,7 @@ load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_cloud") package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ "//tensorflow/compiler/mlir/quantization:__subpackages__", ], diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.cc b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.cc index fe1b205ce1d..c5157fed64c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.cc +++ 
b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.cc @@ -77,7 +77,7 @@ absl::StatusOr> CreateMlirDumpFile( auto *env = tsl::Env::Default(); const tsl::Status status = env->RecursivelyCreateDir(*dump_dir); if (!status.ok()) { - return tsl::ToAbslStatus(status); + return status; } std::error_code ec{}; // NOLINT: Required to create llvm::raw_fd_ostream diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h index db13cd19f08..803cd39a0a5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h @@ -35,7 +35,7 @@ void EnableIrPrinting(llvm::raw_ostream &out_stream, mlir::PassManager &pm); // level < 1 or TF_QUANT_MLIR_DUMP_PREFIX is not set or set to an empty string. // The returned ostream instance should live until the pass run is complete. absl::StatusOr> MaybeEnableIrPrinting( - mlir::PassManager &pm, const absl::string_view name); + mlir::PassManager &pm, absl::string_view name); } // namespace quantization } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc index a3cb59b241c..c3759ff75c6 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/convert_custom_aggregation_op_to_quant_stats.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ #include +#include #include #include #include diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc index 2970cafffc0..9671f1b17eb 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_custom_aggregation_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include +#include #include #include #include diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc index 0d60c9a2020..0e6ce592ea0 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/insert_main_function.cc @@ -96,7 +96,7 @@ bool ShouldIncludeInMainFunction(func::FuncOp func_op) { void SetFunctionPrivate(func::FuncOp func) { func.setVisibility(SymbolTable::Visibility::Private); - // The `tf_saved_model` attributes can only be appied to public functions. + // The `tf_saved_model` attributes can only be applied to public functions. for (auto& attr : func->getAttrs()) { StringRef attr_name = attr.getName().getValue(); if (attr_name.startswith("tf_saved_model.")) { @@ -136,7 +136,7 @@ struct OutputInfo { }; // Makes input/output names across entry functions unique if necessary. If a -// dupliated name is found, this function will add signature prefix for all the +// duplicated name is found, this function will add signature prefix for all the // input/output names. 
void GetUniqueInputOutputNodeNames(ModuleOp module_op, std::vector& input_name_vec, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/issue_ids_of_custom_aggregation_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/issue_ids_of_custom_aggregation_ops.cc index 513076b51fb..0d1302c99f8 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/issue_ids_of_custom_aggregation_ops.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/issue_ids_of_custom_aggregation_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ #include +#include #include #include #include diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_hashtable_ops_as_args.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_hashtable_ops_as_args.cc new file mode 100644 index 00000000000..175bf572074 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/lift_hashtable_ops_as_args.cc @@ -0,0 +1,210 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include + +#include "absl/strings/str_cat.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" + +namespace mlir { +namespace quant { +namespace { + +constexpr StringRef kSharedNameAttr = "shared_name"; + +class LiftHashTableOpsAsArgsPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(LiftHashTableOpsAsArgsPass) + explicit LiftHashTableOpsAsArgsPass() = default; + + StringRef getArgument() const final { + // This is the argument used to refer to the pass in + // the textual format (on the commandline for example). + return "quant-lift-hashtable-ops-as-args"; + } + StringRef getDescription() const final { + return "Lifts HashTable ops as function arguments."; + } + + void runOnOperation() override; +}; + +// Checks if the given op is a Hashtable op. +bool IsHashTableOp(Operation* op) { + return llvm::isa(op); +} + +// Checks if the function is the main or initializer function. 
+bool IsMainOrInitializerFunction(ModuleOp module, func::FuncOp func) { + if (func.getSymName().equals(tensorflow::kImportModelDefaultGraphFuncName) || + func.getSymName().equals(kTfQuantSaveFuncName)) { + return true; + } + + for (func::FuncOp init_func : + tf_saved_model::GetInitializerFunctions(module)) { + if (func.getSymName().equals(init_func.getSymName())) { + return true; + } + } + return false; +} + +// Checks if the function is only used by supported ops. Returns false when the +// function has no uses. Currently, only PartitionedCall is supported. +// TODO(b/284222309): Support lifting for functions called by control flow. +bool UsedBySupportedOps(ModuleOp module, func::FuncOp func) { + auto function_uses = + SymbolTable::getSymbolUses(func, &module.getBodyRegion()); + if (!function_uses.has_value()) return false; + for (auto& function_use : function_uses.value()) { + if (!llvm::isa( + function_use.getUser())) { + return false; + } + } + return true; +} + +// Returns the `shared_name` attribute value if exists. If not, returns an +// empty string. +StringRef GetSharedName(Operation* op) { + if (!op->hasAttrOfType(kSharedNameAttr)) return ""; + return op->getAttrOfType(kSharedNameAttr).getValue(); +} + +// Checks if the HashTable is initialized. This function assumes that the +// HashTable is initialized if it appears in the initializer since it can't +// check the actual value. +bool IsResourceInitialized(ModuleOp module_op, Operation* hash_table) { + StringRef shared_name = GetSharedName(hash_table); + if (shared_name.empty()) return false; + + for (func::FuncOp init_func_op : + tf_saved_model::GetInitializerFunctions(module_op)) { + for (Operation& op : init_func_op.getBody().getOps()) { + StringRef other_shared_name = GetSharedName(&op); + if (IsHashTableOp(&op) && other_shared_name.equals(shared_name)) { + return true; + } + } + } + return false; +} + +// Lifts HashTable ops in the target function as function arguments and returns +// the lifted ops. 
These ops will then be added to the caller function and +// passed to the target function. +LogicalResult LiftHashTableOpsToArguments(ModuleOp module_op, + func::FuncOp target_func) { + if (!llvm::hasSingleElement(target_func)) return success(); + if (!UsedBySupportedOps(module_op, target_func)) return success(); + if (IsMainOrInitializerFunction(module_op, target_func)) return success(); + + llvm::StringMap shared_name_to_arg_idx; + llvm::SmallDenseMap lifted_op_to_arg_idx; + Block& block = target_func.front(); + auto func_type = target_func.getFunctionType(); + + for (Operation& op : block.without_terminator()) { + StringRef shared_name = GetSharedName(&op); + if (shared_name.empty() || !IsHashTableOp(&op)) continue; + if (!IsResourceInitialized(module_op, &op)) continue; + + auto it = + shared_name_to_arg_idx.insert({shared_name, block.getNumArguments()}); + if (it.second) { + auto resource_type = op.getResult(0).getType(); + op.getResult(0).replaceAllUsesWith( + block.addArgument(resource_type, op.getLoc())); + AddEntryFunctionInput( + absl::StrCat("hash_table_", it.first->getValue(), ":0"), target_func); + // Avoid deleting the op here, clone it to the caller function first. + lifted_op_to_arg_idx.insert({&op, it.first->getValue()}); + } else { + op.getResult(0).replaceAllUsesWith( + block.getArgument(it.first->getValue())); + op.erase(); + } + } + if (lifted_op_to_arg_idx.empty()) return success(); + + // Update the function signature as well as its uses. + target_func.setType(FunctionType::get(target_func.getContext(), + block.getArgumentTypes(), + func_type.getResults())); + + IRMapping mapping; + OpBuilder builder(module_op); + OpBuilder::InsertionGuard g(builder); + // The function has been checked to have at least one use. 
+ auto function_uses = + SymbolTable::getSymbolUses(target_func, &module_op.getBodyRegion()); + for (auto& function_use : function_uses.value()) { + auto call_op = function_use.getUser(); + auto caller_func = call_op->getParentOfType(); + if (!caller_func) return failure(); + + builder.setInsertionPoint(call_op); + for (auto [lifted_op, arg_idx] : lifted_op_to_arg_idx) { + auto new_op = builder.clone(*lifted_op, mapping); + call_op->insertOperands(arg_idx, new_op->getResult(0)); + } + + // Try to lift recursively until the main function. + if (failed(LiftHashTableOpsToArguments(module_op, caller_func))) { + return failure(); + } + } + + // Erase the lifted operations explicitly. + for (auto [lifted_op, arg_idx] : lifted_op_to_arg_idx) { + lifted_op->erase(); + } + + return success(); +} + +void LiftHashTableOpsAsArgsPass::runOnOperation() { + auto module_op = getOperation(); + + for (auto func_op : module_op.getOps()) { + if (failed(LiftHashTableOpsToArguments(module_op, func_op))) { + signalPassFailure(); + return; + } + } +} + +static PassRegistration pass; + +} // namespace + +std::unique_ptr> CreateLiftHashTableOpsAsArgsPass() { + return std::make_unique(); +} + +} // namespace quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.cc new file mode 100644 index 00000000000..06784f8dba5 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.cc @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" + +#include +#include + +#include "llvm/ADT/StringExtras.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project + +namespace mlir { +namespace quant { + +constexpr StringRef kTfEntryFunctionAttr = "tf.entry_function"; + +void AddEntryFunctionInput(StringRef input_name, func::FuncOp func_op) { + auto entry_func_attr = + func_op->getAttrOfType(kTfEntryFunctionAttr); + if (!entry_func_attr) return; + + auto entry_func_attrs = SmallVector(entry_func_attr.begin(), + entry_func_attr.end()); + + MLIRContext* ctx = func_op.getContext(); + for (auto& named_attr : entry_func_attrs) { + if (named_attr.getName() != "inputs") continue; + + // Splits the "inputs" field to retrieve individual input names. Ignores + // empty strings. 
+ SmallVector inputs_attrs{}; + cast(named_attr.getValue()) + .strref() + .split(inputs_attrs, /*Separator=*/',', /*MaxSplit=*/-1, + /*KeepEmpty=*/false); + + inputs_attrs.emplace_back(input_name); + + const std::string new_inputs_attr_str = + llvm::join(std::move(inputs_attrs), /*Separator=*/","); + + named_attr.setValue(StringAttr::get(ctx, new_inputs_attr_str)); + } + + func_op->setAttr(kTfEntryFunctionAttr, + DictionaryAttr::get(ctx, entry_func_attrs)); +} +} // namespace quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h b/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h new file mode 100644 index 00000000000..d42ad360034 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h @@ -0,0 +1,32 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_MANIPULATE_MODEL_ATTR_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_MANIPULATE_MODEL_ATTR_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project + +namespace mlir { +namespace quant { + +// Adds a new input name to the `inputs` field of the `tf.entry_function` +// attribute if the attribute exist in the given function. 
Otherwise, no +// attribute is modified. +void AddEntryFunctionInput(StringRef input_name, func::FuncOp func_op); + +} // namespace quant +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_PASSES_MANIPULATE_MODEL_ATTR_H_ diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_duplicate_resource_ops.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_duplicate_resource_ops.cc new file mode 100644 index 00000000000..be179db7306 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_duplicate_resource_ops.cc @@ -0,0 +1,139 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" + +namespace mlir { +namespace quant { +namespace { + +using ::mlir::tf_executor::GraphOp; +using ::mlir::tf_executor::IslandOp; + +constexpr StringRef kSharedNameAttr = "shared_name"; + +class MergeDuplicateResourceOpsPass + : public PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(MergeDuplicateResourceOpsPass) + + StringRef getArgument() const final { + return "quant-merge-duplicate-resource-ops"; + } + + StringRef getDescription() const final { + return "Merge resource ops that have the same shared name."; + } + + void runOnOperation() override; +}; + +// Checks if the island op contains a resource op like Variable or Hashtable +// and returns that resource op. Otherwise, returns null. +Operation* GetResourceOp(Operation* op) { + // Check if the island has only one block thats contain two ops, including + // one resource op and one Yield op. + auto island_op = llvm::dyn_cast_or_null(op); + if (!island_op || !island_op.getBody().hasOneBlock()) return nullptr; + auto& island_block = island_op.getBody().front(); + if (++island_block.begin() != --island_block.end()) return nullptr; + + Operation* resource_op = &island_block.front(); + if (llvm::isa(resource_op)) { + return resource_op; + } + return nullptr; +} + +// Returns the `shared_name` attribute value if exists. If not, returns an +// empty string. +StringRef GetSharedName(Operation* op) { + if (!op->hasAttrOfType(kSharedNameAttr)) return ""; + return op->getAttrOfType(kSharedNameAttr).getValue(); +} + +// Gets the GraphOp from the function op. Returns an empty op iff it doesn't +// exist. 
+// TODO(b/284222084): Move executor dialect utilities to a new library. +GraphOp GetGraphOpFromFuncOp(func::FuncOp func_op) { + if (func_op->getNumRegions() == 0 || func_op.getBody().empty()) return {}; + + auto graph_op_range = func_op.front().without_terminator(); + if (llvm::hasSingleElement(graph_op_range)) { + // The pass runs on a valid tf_executor dialect, so the op should be the + // GraphOp. + return cast(graph_op_range.begin()); + } + + return {}; +} + +void MergeDuplicateResourceOpsPass::runOnOperation() { + func::FuncOp func_op = getOperation(); + GraphOp graph_op = GetGraphOpFromFuncOp(func_op); + if (!graph_op) return; + + llvm::StringMap shared_name_to_resource; + llvm::SmallVector ops_to_remove; + for (Operation& op : graph_op.GetBody().without_terminator()) { + Operation* resource_op = GetResourceOp(&op); + if (!resource_op) continue; + StringRef shared_name = GetSharedName(resource_op); + if (shared_name.empty()) continue; + + if (!shared_name_to_resource.contains(shared_name)) { + shared_name_to_resource[shared_name] = resource_op; + continue; + } + + auto existing_resource = shared_name_to_resource[shared_name]; + if (resource_op->getName().getStringRef() != + existing_resource->getName().getStringRef() || + resource_op->getResult(0).getType() != + existing_resource->getResult(0).getType()) { + resource_op->emitOpError( + "This op has the same `shared_name` but different type with another " + "resource op in the function"); + signalPassFailure(); + return; + } + op.replaceAllUsesWith(existing_resource->getParentOp()->getResults()); + ops_to_remove.push_back(&op); + } + + // Remove op after the loop to avoid crash. 
+ for (Operation* op : ops_to_remove) { + op->erase(); + } +} + +static PassRegistration pass{}; + +} // namespace + +std::unique_ptr> +CreateMergeDuplicateResourceOpsPass() { + return std::make_unique(); +} + +} // namespace quant +} // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc index 221d3318730..6e94beb6b0a 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc @@ -35,6 +35,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -205,42 +206,6 @@ FailureOr> GetInitFuncOps( return init_func_ops; } -// If `main_func_op` has the `tf.entry_function` attribute, adds a new input -// name to the `inputs` field of the attribute. Otherwise, no attribute is -// modified. -void MaybeAddEntryFunctionInput(const StringRef input_name, - func::FuncOp main_func_op) { - auto entry_func_attr = - main_func_op->getAttrOfType("tf.entry_function"); - if (!entry_func_attr) return; - - auto entry_func_attrs = SmallVector(entry_func_attr.begin(), - entry_func_attr.end()); - - MLIRContext* ctx = main_func_op.getContext(); - for (auto& named_attr : entry_func_attrs) { - if (named_attr.getName() != "inputs") continue; - - // Splits the "inputs" field to retrieve individual input names. Ignores - // empty strings. 
- SmallVector inputs_attrs{}; - cast(named_attr.getValue()) - .strref() - .split(inputs_attrs, /*Separator=*/',', /*MaxSplit=*/-1, - /*KeepEmpty=*/false); - - inputs_attrs.emplace_back(input_name); - - const std::string new_inputs_attr_str = - llvm::join(std::move(inputs_attrs), /*Separator=*/","); - - named_attr.setValue(StringAttr::get(ctx, new_inputs_attr_str)); - } - - main_func_op->setAttr("tf.entry_function", - DictionaryAttr::get(ctx, entry_func_attrs)); -} - // Creates new arguments to the main function that corresponds to the source // function's arguments. Returns the `IRMapping` that contains the // relationship. @@ -265,7 +230,7 @@ IRMapping CloneSrcFuncArgumentsToMainFunc(func::FuncOp src_func_op, const std::string new_input_name = absl::StrCat(GetInitializerType(src_func_op), "_", src_arg_idx, ":0"); - MaybeAddEntryFunctionInput(new_input_name, main_func_op); + AddEntryFunctionInput(new_input_name, main_func_op); // During cloning, let it know that the source function's argument // corresponds to the main function's newly created argument when cloning diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc index e5037fe4962..caef5c034f4 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_save_function_ops_to_main.cc @@ -20,6 +20,7 @@ limitations under the License. 
#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/constants.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" @@ -35,8 +36,6 @@ using ::mlir::tf_executor::IslandOp; using ::mlir::tf_saved_model::kTfSavedModelIndexPathAttr; using ::tensorflow::kImportModelDefaultGraphFuncName; -constexpr StringRef kTfEntryFunctionAttr = "tf.entry_function"; - class MergeSaveFunctionOpsToMainPass : public PassWrapper> { @@ -130,28 +129,7 @@ BlockArgument CreateFilePrefixArg(func::FuncOp main_func_op) { // Append the "__tf_file_prefix:0" to the "tf.entry_function" attribute's // item keyed by "inputs". - auto entry_function_attr = - main_func_op->getAttrOfType(kTfEntryFunctionAttr); - - SmallVector new_entry_function_attr_items; - for (NamedAttribute entry_function_attr_item : entry_function_attr) { - if (entry_function_attr_item.getName() == "inputs") { - auto inputs_attr = entry_function_attr_item.getValue().cast(); - const auto new_inputs_value_attr = Twine(inputs_attr.getValue()) - .concat(kTfFilePrefix) - .concat(":0") - .str(); - new_entry_function_attr_items.emplace_back( - builder.getNamedAttr(builder.getStringAttr("inputs"), - builder.getStringAttr(new_inputs_value_attr))); - } else { - new_entry_function_attr_items.emplace_back(entry_function_attr_item); - } - } - - main_func_op->setAttr( - /*name=*/kTfEntryFunctionAttr, - /*value=*/builder.getDictionaryAttr(new_entry_function_attr_items)); + AddEntryFunctionInput(Twine(kTfFilePrefix).concat(":0").str(), main_func_op); return new_file_prefix_arg; } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h 
b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h index 5406e4d2ed8..99edd6fc0ea 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h @@ -109,7 +109,10 @@ std::unique_ptr> CreatePrepareQuantizeDRQPass( // Creates an instance of the PreprocessOp pass, which will perform op // preprocessing to allow multi-axis quantization, prior to quantization. std::unique_ptr> CreatePreprocessOpPass( - const QuantizationSpecs& quant_specs, OpSet op_set); + OpSet op_set, + tensorflow::quantization::QuantizationMethod::ExperimentalMethod + quantization_method, + bool enable_per_channel_quantization); // Creates an instance of the PostQuantize pass, which will remove unnecessary // ops from the final quantized graph. @@ -210,6 +213,16 @@ std::unique_ptr> CreateConvertTpuModelToCpuPass(); // model quantization. std::unique_ptr> CreateCastBf16OpsToF32Pass(); +// Creates a pass that lifts HashTable ops as function arguments. In the graph +// execution mode, resource ops with the same `shared_name` attribute point to +// the same underlying resource. This is not true in the eager execution mode. +// Lifting resource ops as arguments will help unifying them across functions. +std::unique_ptr> CreateLiftHashTableOpsAsArgsPass(); + +// Creates a pass that merges duplicate resource ops in each function. Two +// resource ops are considered duplicated if they have the same `shared_name`. 
+std::unique_ptr> +CreateMergeDuplicateResourceOpsPass(); } // namespace quant } // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td index 6f6e6d89da6..4a95afd8873 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td @@ -41,10 +41,15 @@ def ConvertArithConstToTfConst : Pat< (TF_ConstOp $value), [(AnyStaticShapeTensor $res)]>; -// Converts CheckNumerics op to Identity -def ConvertCheckNumerics : Pat< +// Remove CheckNumerics op +def RemoveCheckNumerics : Pat< (TF_CheckNumericsOp $arg, $msg), - (TF_IdentityOp $arg)>; + (replaceWithValue $arg)>; + +// Remove StopGradient op +def RemoveStopGradient : Pat< + (TF_StopGradientOp $arg), + (replaceWithValue $arg)>; // Only handles the case where batch_dimension is empty. def IsXlaGatherWithoutBatch : diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc index 18aa58fe60c..4ec9e5361ff 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" @@ -42,6 +43,8 @@ namespace quant { namespace { +using QuantMethod = + tensorflow::quantization::QuantizationMethod::ExperimentalMethod; using QuantizationUnit = std::pair; using QuantizationUnits = llvm::SetVector; @@ -57,19 +60,20 @@ class PreprocessOpPass public: MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PreprocessOpPass) - // Constructor used by the PassRegistration and enforce int8 quantization. - // This is only used by test. - explicit PreprocessOpPass() : op_set_(OpSet::UNIFORM_QUANTIZED) { - quant_specs_.inference_type = tensorflow::DT_QINT8; - } + explicit PreprocessOpPass() = default; // Constructor used by manually creating the pass. 
- explicit PreprocessOpPass(const QuantizationSpecs& quant_specs, OpSet op_set) - : quant_specs_(quant_specs), op_set_(op_set) {} + explicit PreprocessOpPass(OpSet op_set, const QuantMethod quantization_method, + bool enable_per_channel_quantization) { + op_set_ = op_set; + quantization_method_ = quantization_method; + enable_per_channel_quantization_ = enable_per_channel_quantization; + } PreprocessOpPass(const PreprocessOpPass& other) { - quant_specs_ = other.quant_specs_; op_set_ = other.op_set_; + quantization_method_ = other.quantization_method_; + enable_per_channel_quantization_ = other.enable_per_channel_quantization_; } StringRef getArgument() const final { @@ -85,15 +89,103 @@ class PreprocessOpPass void runOnOperation() override; private: - QuantizationSpecs quant_specs_; - OpSet op_set_; + Option op_set_{ + *this, "target-opset", llvm::cl::init(OpSet::UNIFORM_QUANTIZED), + llvm::cl::desc("Choose target opset."), + llvm::cl::values( + clEnumValN(OpSet::TF, "TF", + "Uses TF ops that mimic quantization behavior"), + clEnumValN(OpSet::XLA, "XLA", "Uses TF XLA ops"), + clEnumValN(OpSet::UNIFORM_QUANTIZED, "UNIFORM_QUANTIZED", + "Uses TF Uniform Quantized ops"))}; + + Option quantization_method_{ + *this, "quantization-method", + llvm::cl::init( + tensorflow::quantization::QuantizationMethod::STATIC_RANGE), + llvm::cl::desc("Choose quantization method."), + llvm::cl::values( + clEnumValN(tensorflow::quantization::QuantizationMethod::STATIC_RANGE, + "ptq", "Post-training static-range quantization"), + clEnumValN( + tensorflow::quantization::QuantizationMethod::DYNAMIC_RANGE, + "drq", "Post-training dynamic-range quantizaiton"), + clEnumValN(tensorflow::quantization::QuantizationMethod::WEIGHT_ONLY, + "weight_only", "Post-training weight-only quantizaiton"))}; + + Option enable_per_channel_quantization_{ + *this, "enable-per-channel-quantization", llvm::cl::init(false), + llvm::cl::desc("Whether enable per-channel quantized weights.")}; }; // Apply constant 
transformations for the op_set. class PreprocessConstantOp : public OpRewritePattern { public: - explicit PreprocessConstantOp(MLIRContext* context, OpSet op_set) - : OpRewritePattern(context), op_set_(op_set) {} + explicit PreprocessConstantOp(MLIRContext* context, OpSet op_set, + QuantMethod quantization_method, + bool enable_per_channel_quantization) + : OpRewritePattern(context), + op_set_(op_set), + quantization_method_(quantization_method), + enable_per_channel_quantization_(enable_per_channel_quantization) {} + + LogicalResult addReshapeOpToDepthwiseWeight(TF::PartitionedCallOp op, + PatternRewriter& rewriter, + StringRef function_name) const { + std::unique_ptr spec = GetTFOpQuantSpec(op); + const absl::flat_hash_set operands = spec->quantizable_operands; + + if (operands.size() != 1) return failure(); + int weight_operand_idx = *operands.begin(); + + Operation* weight_op = op.getOperand(weight_operand_idx).getDefiningOp(); + DenseFPElementsAttr attr; + if (!matchPattern(weight_op->getResult(0), m_Constant(&attr))) { + return failure(); + } + + // Get new shape. + llvm::ArrayRef cur_shape = attr.getType().getShape(); + int cur_rank = cur_shape.size(); + if (cur_rank != 4 || cur_shape[2] == 1) return failure(); + TensorType new_shape = RankedTensorType::get( + {cur_shape[0], cur_shape[1], 1, cur_shape[2] * cur_shape[3]}, + attr.getElementType()); + + // Inserts a reshape op. + auto shape_spec_type = + RankedTensorType::get({cur_rank}, rewriter.getIntegerType(64)); + auto new_shape_const_attr = + DenseElementsAttr::get(shape_spec_type, new_shape.getShape()); + rewriter.setInsertionPointAfter(weight_op); + auto new_shape_const = rewriter.create( + weight_op->getLoc(), shape_spec_type, new_shape_const_attr); + auto reshape_op = rewriter.create( + weight_op->getLoc(), new_shape, weight_op->getResult(0), + new_shape_const); + op->setOperand(weight_operand_idx, reshape_op); + + // Create a new function with preprocessed types. 
+ ModuleOp module = op->getParentOfType(); + SymbolTable symbol_table(module); + func::FuncOp float_func = + dyn_cast(symbol_table.lookup(function_name)); + OperandRange func_args = op.getArgs(); + func::FuncOp new_float_func = float_func.clone(); + + SmallVector new_float_func_args{func_args.begin(), func_args.end()}; + new_float_func_args[weight_operand_idx] = reshape_op; + new_float_func.getArgument(weight_operand_idx).setType(new_shape); + new_float_func.setType(FunctionType::get( + getContext(), TypeRange{ValueRange{new_float_func_args}}, + new_float_func.getResultTypes())); + symbol_table.insert(new_float_func); + + op->setAttr("f", SymbolRefAttr::get(rewriter.getContext(), + new_float_func.getName())); + + return success(); + } LogicalResult matchAndRewrite(TF::PartitionedCallOp op, PatternRewriter& rewriter) const override { @@ -101,13 +193,12 @@ class PreprocessConstantOp : public OpRewritePattern { // Non-quantizable op if (!op->hasAttr(kQuantTraitAttrName)) return failure(); StringRef function_name = f_attr.getValue(); + // TODO(b/228928859): Improve the getter function to match attributes rather + // than function name. if (!function_name.startswith("composite_")) { return failure(); } - std::unique_ptr spec = GetTFOpQuantSpec(op); - const absl::flat_hash_set operands = spec->quantizable_operands; - if (function_name.contains("depthwise_conv2d")) { // Uniform Quantized op requires weights of tf.DepthwiseConv2dNative to // be transformed from [H,W,C,M] to [H,W,1,CxM] where @@ -115,57 +206,11 @@ class PreprocessConstantOp : public OpRewritePattern { // inserted between the constant op and the function op so that the // constant is safely transformed for the multi-use cases as well. Note // that bias doesn't need transformation as its shape is already in [CxM]. 
- if (operands.size() != 1) return failure(); - int weight_operand_idx = *operands.begin(); - Operation* weight_op = op.getOperand(weight_operand_idx).getDefiningOp(); - - if (op_set_ == OpSet::UNIFORM_QUANTIZED) { - DenseFPElementsAttr attr; - if (!matchPattern(weight_op->getResult(0), m_Constant(&attr))) { - return failure(); - } - - // Get new shape. - llvm::ArrayRef cur_shape = attr.getType().getShape(); - int cur_rank = cur_shape.size(); - if (cur_rank != 4 || cur_shape[2] == 1) return failure(); - TensorType new_shape = RankedTensorType::get( - {cur_shape[0], cur_shape[1], 1, cur_shape[2] * cur_shape[3]}, - attr.getElementType()); - - // Inserts a reshape op. - auto shape_spec_type = - RankedTensorType::get({cur_rank}, rewriter.getIntegerType(64)); - auto new_shape_const_attr = - DenseElementsAttr::get(shape_spec_type, new_shape.getShape()); - rewriter.setInsertionPointAfter(weight_op); - auto new_shape_const = rewriter.create( - weight_op->getLoc(), shape_spec_type, new_shape_const_attr); - auto reshape_op = rewriter.create( - weight_op->getLoc(), new_shape, weight_op->getResult(0), - new_shape_const); - op->setOperand(weight_operand_idx, reshape_op); - - // Create a new function with preprocessed types. 
- ModuleOp module = op->getParentOfType(); - SymbolTable symbol_table(module); - func::FuncOp float_func = - dyn_cast(symbol_table.lookup(function_name)); - OperandRange func_args = op.getArgs(); - func::FuncOp new_float_func = float_func.clone(); - - SmallVector new_float_func_args{func_args.begin(), - func_args.end()}; - new_float_func_args[weight_operand_idx] = reshape_op; - new_float_func.getArgument(weight_operand_idx).setType(new_shape); - new_float_func.setType(FunctionType::get( - getContext(), TypeRange{ValueRange{new_float_func_args}}, - new_float_func.getResultTypes())); - symbol_table.insert(new_float_func); - - op->setAttr("f", SymbolRefAttr::get(rewriter.getContext(), - new_float_func.getName())); - return success(); + if (op_set_ == OpSet::UNIFORM_QUANTIZED || + (op_set_ == OpSet::XLA && enable_per_channel_quantization_ && + quantization_method_ == + tensorflow::quantization::QuantizationMethod::WEIGHT_ONLY)) { + return addReshapeOpToDepthwiseWeight(op, rewriter, function_name); } } return failure(); @@ -173,6 +218,8 @@ class PreprocessConstantOp : public OpRewritePattern { private: const OpSet op_set_; + const QuantMethod quantization_method_; + const bool enable_per_channel_quantization_; }; #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/preprocess_op.inc" @@ -183,7 +230,8 @@ void PreprocessOpPass::runOnOperation() { ModuleOp module_op = getOperation(); populateWithGenerated(patterns); - patterns.add(ctx, op_set_); + patterns.add(ctx, op_set_, quantization_method_, + enable_per_channel_quantization_); FrozenRewritePatternSet frozen_patterns(std::move(patterns)); for (auto func : module_op.getOps()) { @@ -199,8 +247,10 @@ void PreprocessOpPass::runOnOperation() { // Creates an instance of the TensorFlow dialect PreprocessOp // pass. 
std::unique_ptr> CreatePreprocessOpPass( - const QuantizationSpecs& quant_specs, const OpSet op_set) { - return std::make_unique(quant_specs, op_set); + const OpSet op_set, QuantMethod quantization_method, + const bool enable_per_channel_quantization) { + return std::make_unique(op_set, quantization_method, + enable_per_channel_quantization); } static PassRegistration pass; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc index 6c374141025..9269461a80c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/quantize_composite_functions.cc @@ -968,6 +968,79 @@ class QuantizeConstPattern OpSet target_opset_; }; +// To calculate per-channel scale and offset, weight of depthwise was reshaped +// to [H, W, 1, InxMul]. After scale and offset has been calculated, this +// pattern gets called and restores the weight of depthwise back +// into [H, W, In, Mul] +class RestoreWeightShapePattern + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + private: + LogicalResult addReshapeOpToDepthwiseWeight(TF::PartitionedCallOp op, + PatternRewriter& rewriter) const { + int weight_operand_idx = 1; + Operation* weight_op = op.getOperand(weight_operand_idx).getDefiningOp(); + + auto weight_type = weight_op->getResult(0).getType().dyn_cast(); + auto input_type = op.getOperand(0).getType().dyn_cast(); + + llvm::ArrayRef weight_shape = weight_type.getShape(); + llvm::ArrayRef input_shape = input_type.getShape(); + + // If weight_shape[2] != 1, it means weight shape was already restored. + if (weight_shape[2] != 1) return failure(); + + // Weight was reshaped into [H, W, 1, InxMul]. + // Since we know in_channels from input_shape, we can derive multiplier. 
+ int64_t in_channels = input_shape[3]; + // If in_channels is 1, there is no need to restore weight shape. + if (in_channels == 1) return failure(); + int64_t multiplier = weight_shape[3] / in_channels; + + TensorType new_shape = RankedTensorType::get( + {weight_shape[0], weight_shape[1], in_channels, multiplier}, + weight_type.getElementType()); + + int cur_rank = weight_type.getRank(); + + // Inserts a reshape op. + auto shape_spec_type = + RankedTensorType::get({cur_rank}, rewriter.getIntegerType(64)); + auto new_shape_const_attr = + DenseElementsAttr::get(shape_spec_type, new_shape.getShape()); + rewriter.setInsertionPointAfter(weight_op); + auto new_shape_const = rewriter.create( + weight_op->getLoc(), shape_spec_type, new_shape_const_attr); + auto reshape_op = rewriter.create( + weight_op->getLoc(), new_shape, weight_op->getResult(0), + new_shape_const); + op->setOperand(weight_operand_idx, reshape_op); + + return success(); + } + + LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, + PatternRewriter& rewriter) const override { + const auto f_attr = call_op.getFAttr().dyn_cast(); + StringRef function_name = f_attr.getValue(); + // TODO(b/228928859): Improve the getter function to match attributes rather + // than function name. + // If enable_legacy_weight_only is enabled, QuantizeFunctionsPattern + // does not get called and function remains as composite + if (!function_name.startswith("quantized_") && + !function_name.startswith("composite_")) { + return failure(); + } + + if (function_name.contains("depthwise_conv2d")) { + return addReshapeOpToDepthwiseWeight(call_op, rewriter); + } + + return failure(); + } +}; + // Prints a summary about the quantization results. 
class QuantizationSummary { public: @@ -1133,10 +1206,12 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { pm.enableVerifier(false); QuantizationSpecs quant_specs; - pm.addPass(CreatePreprocessOpPass(quant_specs, target_opset_)); - quant_specs.inference_type = tensorflow::DT_QINT8; quant_specs.disable_per_channel = !enable_per_channel_quantization_; + + pm.addPass(CreatePreprocessOpPass(target_opset_, quantization_method_, + enable_per_channel_quantization_)); + // Apply activation-weight quantization. if (quantization_method_ == tensorflow::quantization::QuantizationMethod::STATIC_RANGE) { @@ -1180,6 +1255,13 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { patterns_2.add( ctx, target_opset_); patterns_2.add(ctx, target_opset_); + + if (target_opset_ == OpSet::XLA && enable_per_channel_quantization_ && + quantization_method_ == + tensorflow::quantization::QuantizationMethod::WEIGHT_ONLY) { + patterns_2.add(ctx); + } + if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns_2))) || failed(verify(module))) { signalPassFailure(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD index 3b5a9d55f5f..538be57c88c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD @@ -137,12 +137,12 @@ pytype_strict_library( ], deps = [ "//tensorflow/core:protos_all_py", - "//tensorflow/python:framework", - "//tensorflow/python:framework_ops", - "//tensorflow/python:variables", "//tensorflow/python/client:session", + "//tensorflow/python/framework", "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", "//tensorflow/python/lib/io:lib", + "//tensorflow/python/ops:variables", "//tensorflow/python/saved_model:builder", "//tensorflow/python/saved_model:constants", "//tensorflow/python/saved_model:loader", @@ -167,11 +167,11 @@ pytype_strict_library( 
"//tensorflow/compiler/mlir/quantization/tensorflow:exported_model_proto_py", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", "//tensorflow/core:protos_all_py", - "//tensorflow/python:framework_ops", "//tensorflow/python:pywrap_tensorflow", "//tensorflow/python/client:session", "//tensorflow/python/eager:context", "//tensorflow/python/eager:wrap_function", + "//tensorflow/python/framework:ops", "//tensorflow/python/framework:tensor_conversion", "//tensorflow/python/lib/io:lib", "//tensorflow/python/platform:tf_logging", @@ -198,8 +198,8 @@ tf_py_test( "//tensorflow:tensorflow_py", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", "//tensorflow/core:protos_all_py", - "//tensorflow/python:client_testlib", "//tensorflow/python/data/ops:dataset_ops", + "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:tf_logging", "//tensorflow/python/saved_model:tag_constants", "//third_party/py/numpy", @@ -216,16 +216,6 @@ pytype_library( ":representative_dataset", "//tensorflow:tensorflow_py", "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:array_ops_stack", - "//tensorflow/python:client_testlib", - "//tensorflow/python:io_ops", - "//tensorflow/python:lookup_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:string_ops", - "//tensorflow/python:variables", "//tensorflow/python/client:session", "//tensorflow/python/eager:def_function", "//tensorflow/python/framework:dtypes", @@ -233,7 +223,17 @@ pytype_library( "//tensorflow/python/framework:tensor_spec", "//tensorflow/python/lib/io:lib", "//tensorflow/python/module", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:io_ops", + "//tensorflow/python/ops:lookup_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + 
"//tensorflow/python/ops:random_ops", + "//tensorflow/python/ops:string_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/ops/ragged:ragged_string_ops", + "//tensorflow/python/platform:client_testlib", "//tensorflow/python/saved_model:builder", "//tensorflow/python/saved_model:save", "//tensorflow/python/saved_model:signature_def_utils", @@ -253,7 +253,7 @@ tf_py_test( deps = [ ":quantize_model", "//tensorflow:tensorflow_py", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//tensorflow/python/saved_model:tag_constants", "@absl_py//absl/testing:parameterized", ], diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py index 892dfde7c9a..a023c8e9148 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py @@ -198,6 +198,7 @@ class QuantizationOptionsTest(quantize_model_test_base.QuantizedModelTest): """ class SimpleModel(module.Module): + def __init__(self): self.filters = np.random.uniform(low=-1.0, high=1.0, size=(4, 3)).astype( 'f4' @@ -288,7 +289,9 @@ class QuantizationOptionsTest(quantize_model_test_base.QuantizedModelTest): self._input_saved_model_path, quantization_options=options ) - def test_per_channel_for_non_uniform_opset_raises_value_error(self): + def test_drq_per_channel_for_non_uniform_opset_raises_value_error( + self, + ): model = self.SimpleModel() saved_model_save.save(model, self._input_saved_model_path) @@ -385,6 +388,7 @@ class QuantizationOptionsTest(quantize_model_test_base.QuantizedModelTest): quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE ), + op_set=quant_opts_pb2.TF, force_graph_mode_calibration=True, ) @@ -916,8 +920,14 @@ 
class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): equation, shape_unknown, has_bias and not shape_unknown ) ) - model = self._create_einsum_model_with_fake_quant( - equation, y_shape, x_signature, y_signature, bias_shape, activation_fn + model = self._create_einsum_model( + equation, + y_shape, + x_signature, + y_signature, + bias_shape, + activation_fn, + is_qat_model=True, ) x = array_ops.constant( np.random.uniform(size=x_shape), dtype=dtypes.float32 @@ -1027,8 +1037,14 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): equation, shape_unknown, has_bias and not shape_unknown ) ) - model = self._create_einsum_model_with_fake_quant( - equation, y_shape, x_signature, y_signature, bias_shape, activation_fn + model = self._create_einsum_model( + equation, + y_shape, + x_signature, + y_signature, + bias_shape, + activation_fn, + is_qat_model=True, ) x = array_ops.constant( @@ -1098,13 +1114,14 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self._prepare_sample_einsum_datashapes(equation) ) - model = self._create_einsum_model_with_fake_quant( + model = self._create_einsum_model( equation, y_shape, x_signature, y_signature, bias_shape=None, activation_fn=None, + is_qat_model=True, ) if use_kernel: @@ -1180,8 +1197,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self._output_saved_model_path, self._input_saved_model_path, 0.5 ) - # TODO(b/244276332): Allow table initialization in TF2 eager mode. 
- @test_util.deprecated_graph_mode_only def test_qat_vocab_table_lookup_model(self): tags = {tag_constants.SERVING} signature_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY @@ -1204,7 +1219,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) signature_def_keys = [signature_def_key] @@ -1253,8 +1269,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertAllClose(lookup_val, [1.0, 2.0, 0.0]) - # TODO(b/244276332): Allow table initialization in TF2 eager mode. - @test_util.deprecated_graph_mode_only def test_qat_file_init_hash_table_lookup_model_tf1(self): tags = {tag_constants.SERVING} signature_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY @@ -1277,7 +1291,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) signature_def_keys = [signature_def_key] @@ -1390,7 +1405,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) converted_model = quantize_model.quantize( @@ -2044,7 +2060,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) converted_model = 
quantize_model.quantize( @@ -2135,7 +2152,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): def test_matmul_with_reshape_and_bias_ptq_model( self, input_shape, filter_shape, bias_size, activation_fn, use_biasadd ): - model = self._create_matmul_model( input_shape, filter_shape, @@ -2175,9 +2191,7 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): ) input_data = ops.convert_to_tensor( - rng.uniform(low=0.0, high=1.0, size=input_shape).astype( - np.float32 - ) + rng.uniform(low=0.0, high=1.0, size=input_shape).astype(np.float32) ) expected_outputs = model.matmul(input_data) @@ -2188,31 +2202,38 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertAllClose(expected_outputs, got_outputs, atol=0.05) @parameterized.parameters( - ('abc,cde->abde', (2, 2, 64), (64, 3, 3), (3, 3), quant_opts_pb2.XLA), - ('abc,dce->abde', (2, 2, 64), (3, 64, 3), (3, 3), quant_opts_pb2.XLA), + ('abc,cde->abde', quant_opts_pb2.XLA), + ('abc,dce->abde', quant_opts_pb2.XLA), ) def test_einsum_ptq_model( self, equation: str, - input_shape: Sequence[int], - weight_shape: Sequence[int], - bias_shape: Sequence[int], target_opset: quant_opts_pb2.OpSet, ): + _, y_shape, bias_shape, x_signature, y_signature = ( + self._prepare_sample_einsum_datashapes(equation, use_bias=True) + ) + model = self._create_einsum_model( - self._input_saved_model_path, equation, - input_shape, - weight_shape, + y_shape, + x_signature, + y_signature, bias_shape, activation_fn=nn_ops.relu, ) + signatures = { + 'serving_default': model.einsum_with_kernel.get_concrete_function(), + } + + saved_model_save.save(model, self._input_saved_model_path, signatures) + def data_gen() -> repr_dataset.RepresentativeDataset: - for _ in range(200): + for _ in range(4): yield { - 'input_tensor': ops.convert_to_tensor( - np.random.uniform(low=0.0, high=1.0, size=input_shape).astype( + 'x': ops.convert_to_tensor( + 
np.random.uniform(low=0.0, high=1.0, size=x_signature).astype( 'f4' ) ), @@ -2223,7 +2244,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) converted_model = quantize_model.quantize( @@ -2246,13 +2268,13 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertTrue(self._contains_quantized_function_call(output_graphdef)) input_data = ops.convert_to_tensor( - np.random.uniform(low=0.0, high=1.0, size=input_shape).astype('f4') + np.random.uniform(low=0.0, high=1.0, size=x_signature).astype('f4') ) - expected_outputs = model.einsum(input_data) + expected_outputs = model.einsum_with_kernel(input_data) got_outputs = converted_model.signatures['serving_default']( - input_tensor=ops.convert_to_tensor(input_data) + x=ops.convert_to_tensor(input_data) ) - self.assertAllClose(expected_outputs, got_outputs, atol=0.0608) + self.assertAllClose(expected_outputs, got_outputs, atol=0.097) # Check the converted model in the target opset. quantization_options = quant_opts_pb2.QuantizationOptions( @@ -2283,10 +2305,10 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertTrue(self._contains_op(output_graphdef, 'XlaDotV2')) new_outputs = converted_model.signatures['serving_default']( - input_tensor=ops.convert_to_tensor(input_data) + x=ops.convert_to_tensor(input_data) ) # The difference between TF and target path is expected to be small. 
- self.assertAllClose(new_outputs, got_outputs, atol=0.0666) + self.assertAllClose(new_outputs, got_outputs, atol=0.097) self.assertAllClose(new_outputs, expected_outputs, atol=0.057) @test_util.run_in_graph_and_eager_modes @@ -2363,8 +2385,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): _, y_shape, _, x_signature, y_signature = ( self._prepare_sample_einsum_datashapes('ab,bc->ac') ) - model = self._create_einsum_model_with_fake_quant( - 'ab,bc->ac', y_shape, x_signature, y_signature + model = self._create_einsum_model( + 'ab,bc->ac', y_shape, x_signature, y_signature, is_qat_model=True ) signatures = { @@ -2420,7 +2442,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): func.node_def, op_name='XlaDotV2', attr_name='', attr_val=None ) - @test_util.deprecated_graph_mode_only def test_matmul_ptq_model_with_unfreeze_constants(self): # Uses large weight to exceed the constant size threshold of 64KiB # (specified by `kDefaultConstantSizeThresholdInBytes`) for unfreezing. 
@@ -2440,6 +2461,7 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE ), + op_set=quant_opts_pb2.TF, freeze_all_variables=quant_opts_pb2.FreezeAllVariables(enabled=False), ) @@ -2507,7 +2529,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) data_gen = self._create_data_generator( @@ -2549,7 +2572,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) tags = {tag_constants.SERVING} @@ -2590,7 +2614,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) tags = {tag_constants.SERVING} @@ -2634,7 +2659,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) tags = {tag_constants.SERVING} @@ -2676,7 +2702,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) tags = 
{tag_constants.SERVING} signature_def_keys = [signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY] @@ -2726,6 +2753,53 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def self.assertTrue(self._contains_quantized_function_call(output_graphdef)) + def test_model_ptq_preserving_assets_extra(self): + self._create_matmul_model( + input_shape=(1, 1024), + weight_shape=(1024, 3), + saved_model_path=self._input_saved_model_path, + ) + asset_filename = 'assets.extra/tf_serving_warmup_requests' + file_io.create_dir_v2( + os.path.join(self._input_saved_model_path, 'assets.extra') + ) + file_io.write_string_to_file( + filename=os.path.join(self._input_saved_model_path, asset_filename), + file_content='Test content', + ) + + quantization_options = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + experimental_method=_ExperimentalMethod.STATIC_RANGE + ), + op_set=quant_opts_pb2.TF, + ) + tags = {tag_constants.SERVING} + + # Use plain python lists as representative samples. + representative_dataset = [ + { + 'input_tensor': [[i * 0.1 for i in range(1024)]], + } + for _ in range(4) + ] + + converted_model = quantize_model.quantize( + self._input_saved_model_path, + ['serving_default'], + tags, + self._output_saved_model_path, + quantization_options=quantization_options, + representative_dataset=representative_dataset, + ) + self.assertIsNotNone(converted_model) + # Check if the assets.extra file exists in the output model. + self.assertTrue( + file_io.file_exists_v2( + os.path.join(self._output_saved_model_path, asset_filename) + ) + ) + # tf.data.Dataset is as an Iterable (thus can be used as representative # dataset) only in TF2 (eager mode). 
@test_util.run_v2_only @@ -2739,7 +2813,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) tags = {tag_constants.SERVING} @@ -2897,7 +2972,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) with self.assertLogs(level='WARN') as warning_logs: @@ -2964,7 +3040,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) def data_gen_sig1() -> repr_dataset.RepresentativeDataset: @@ -3074,7 +3151,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) data_gen = self._create_data_generator( @@ -3129,7 +3207,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) data_gen = self._create_data_generator( @@ -3160,7 +3239,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def 
self.assertTrue(self._contains_quantized_function_call(output_graphdef)) - @test_util.deprecated_graph_mode_only def test_ptq_model_with_variable_tf1_saved_model_unfreeze_constants(self): signature_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY tags = {tag_constants.SERVING} @@ -3184,6 +3262,7 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE ), + op_set=quant_opts_pb2.TF, freeze_all_variables=quant_opts_pb2.FreezeAllVariables(enabled=False), ) @@ -3257,7 +3336,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) data_gen = self._create_data_generator( @@ -3316,7 +3396,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) def data_gen_sig1() -> repr_dataset.RepresentativeDataset: @@ -3429,7 +3510,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) data_gen = self._create_data_generator( @@ -3496,8 +3578,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): representative_dataset=data_gen, ) - # TODO(b/244276332): Allow table initialization in TF2 eager mode. 
- @test_util.deprecated_graph_mode_only def test_ptq_vocab_table_lookup_model(self): tags = {tag_constants.SERVING} signature_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY @@ -3520,7 +3600,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) signature_def_keys = [signature_def_key] @@ -3569,7 +3650,6 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertAllClose(lookup_val, [1.0, 2.0, 0.0]) - @test_util.deprecated_graph_mode_only def test_ptq_file_init_hash_table_lookup_model(self): tags = {tag_constants.SERVING} signature_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY @@ -3592,7 +3672,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) signature_def_keys = [signature_def_key] @@ -3871,7 +3952,8 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.STATIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) data_gen = self._create_data_generator( @@ -3917,6 +3999,78 @@ class DynamicRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): eager mode (default in TF2) to ensure support for when TF2 is disabled. 
""" + @parameterized.parameters( + (True, quant_opts_pb2.XLA), + (False, quant_opts_pb2.XLA), + (True, quant_opts_pb2.UNIFORM_QUANTIZED), + (False, quant_opts_pb2.UNIFORM_QUANTIZED), + ) + @test_util.run_in_graph_and_eager_modes + def test_einsum_model( + self, + constant_y_operand: bool, + target_opset: quant_opts_pb2.OpSet, + ): + equation = 'abc,cde->abde' + _, y_shape, bias_shape, x_signature, y_signature = ( + self._prepare_sample_einsum_datashapes(equation, use_bias=True) + ) + + model = self._create_einsum_model( + equation, + y_shape, + x_signature, + y_signature, + bias_shape, + activation_fn=nn_ops.relu, + ) + + if constant_y_operand: + signatures = { + 'serving_default': model.einsum_with_kernel.get_concrete_function(), + } + else: + signatures = { + 'serving_default': ( + model.einsum_without_kernel.get_concrete_function() + ), + } + + saved_model_save.save(model, self._input_saved_model_path, signatures) + + tags = {tag_constants.SERVING} + quantization_options = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + experimental_method=_ExperimentalMethod.DYNAMIC_RANGE + ), + op_set=target_opset, + ) + + converted_model = quantize_model.quantize( + self._input_saved_model_path, + ['serving_default'], + tags, + self._output_saved_model_path, + quantization_options, + ) + self.assertIsNotNone(converted_model) + self.assertCountEqual( + converted_model.signatures._signatures.keys(), {'serving_default'} + ) + + output_loader = saved_model_loader.SavedModelLoader( + self._output_saved_model_path + ) + output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def + + # TODO(b/286489783): Support Einsum + if target_opset == quant_opts_pb2.UNIFORM_QUANTIZED: + self.assertFalse(self._contains_op(output_graphdef, 'XlaDotV2')) + self.assertTrue(self._contains_op(output_graphdef, 'BatchMatMulV2')) + else: + self.assertFalse(self._contains_op(output_graphdef, 'XlaDotV2')) + 
self.assertTrue(self._contains_op(output_graphdef, 'Einsum')) + @parameterized.named_parameters( ('to_tf_per_tensor', quant_opts_pb2.TF, False), ('to_xla_per_tensor', quant_opts_pb2.XLA, False), @@ -4555,7 +4709,8 @@ class DynamicRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): quantization_options = quant_opts_pb2.QuantizationOptions( quantization_method=quant_opts_pb2.QuantizationMethod( experimental_method=_ExperimentalMethod.DYNAMIC_RANGE - ) + ), + op_set=quant_opts_pb2.TF, ) converted_model = quantize_model.quantize( @@ -4578,8 +4733,6 @@ class DynamicRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def self.assertTrue(self._contains_quantized_function_call(output_graphdef)) - # TODO(b/244276332): Allow table initialization in TF2 eager mode. - @test_util.deprecated_graph_mode_only def test_table_initialized_when_model_has_table_tf1(self): tags = {tag_constants.SERVING} signature_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY @@ -4637,7 +4790,6 @@ class DynamicRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): self.assertAllClose(lookup_val, [1.0, 2.0, 0.0]) - @test_util.deprecated_graph_mode_only def test_file_init_hash_table_lookup_model(self): tags = {tag_constants.SERVING} signature_def_key = signature_constants.DEFAULT_SERVING_SIGNATURE_DEF_KEY @@ -4700,6 +4852,65 @@ class WeightOnlyQuantizationTest(quantize_model_test_base.QuantizedModelTest): (default in TF2) to ensure support for when TF2 is disabled. """ + @test_util.run_in_graph_and_eager_modes + def test_einsum_model( + self, + ): + equation = 'abc,cde->abde' + _, y_shape, bias_shape, x_signature, y_signature = ( + self._prepare_sample_einsum_datashapes(equation, use_bias=True) + ) + + model = self._create_einsum_model( + equation, + y_shape, + x_signature, + y_signature, + bias_shape, + activation_fn=nn_ops.relu, + ) + + # Use constant y operand. 
+ signatures = { + 'serving_default': model.einsum_with_kernel.get_concrete_function(), + } + + saved_model_save.save(model, self._input_saved_model_path, signatures) + + tags = {tag_constants.SERVING} + quantization_options = quant_opts_pb2.QuantizationOptions( + quantization_method=quant_opts_pb2.QuantizationMethod( + experimental_method=_ExperimentalMethod.WEIGHT_ONLY + ), + op_set=quant_opts_pb2.XLA, + ) + + converted_model = quantize_model.quantize( + self._input_saved_model_path, + ['serving_default'], + tags, + self._output_saved_model_path, + quantization_options, + ) + self.assertIsNotNone(converted_model) + self.assertCountEqual( + converted_model.signatures._signatures.keys(), {'serving_default'} + ) + + output_loader = saved_model_loader.SavedModelLoader( + self._output_saved_model_path + ) + output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def + + # TODO(b/286489783): Support Einsum for Weight only quantization + # Due to other meta data, the compression is not exactly 1/4. 
+ self.assertFalse(self._contains_op(output_graphdef, 'XlaDotV2')) + self.assertSizeRatioLessThan( + self._output_saved_model_path, + self._input_saved_model_path, + threshold=0.5, + ) + @parameterized.named_parameters( # TODO(b/269421880): Enable legacy weight-only scheme with the uniform # quantized opset @@ -4756,17 +4967,22 @@ class WeightOnlyQuantizationTest(quantize_model_test_base.QuantizedModelTest): @parameterized.named_parameters( # TODO(b/269421880): Enable legacy weight-only scheme with the uniform # quantized opset - ('to_xla_per_tensor', quant_opts_pb2.XLA, False), + ('to_xla_per_tensor', quant_opts_pb2.XLA, False, False), + ('to_xla_per_channel', quant_opts_pb2.XLA, True, False), + ('to_xla_per_channel_legacy', quant_opts_pb2.XLA, True, True), ) @test_util.run_in_graph_and_eager_modes def test_conv_model( self, target_opset: quant_opts_pb2.OpSet, enable_per_channel_quantization: bool, + enable_legacy_weight_only: bool, ): + input_shape = (1, 3, 4, 512) + filter_shape = (2, 3, 512, 2) model = self._create_conv2d_model( - input_shape=(1, 3, 4, 512), - filter_shape=(2, 3, 512, 2), + input_shape=input_shape, + filter_shape=filter_shape, has_bias=False, has_batch_norm=False, activation_fn=nn_ops.relu6, @@ -4781,6 +4997,7 @@ class WeightOnlyQuantizationTest(quantize_model_test_base.QuantizedModelTest): ), op_set=target_opset, enable_per_channel_quantization=enable_per_channel_quantization, + enable_legacy_weight_only=enable_legacy_weight_only, ) converted_model = quantize_model.quantize( @@ -4801,30 +5018,68 @@ class WeightOnlyQuantizationTest(quantize_model_test_base.QuantizedModelTest): ) output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def + if not enable_legacy_weight_only: + self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2')) + # Due to other meta data, the compression is not exactly 1/4. 
- self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2')) self.assertSizeRatioLessThan( self._output_saved_model_path, self._input_saved_model_path, threshold=0.3, ) + if enable_per_channel_quantization: + per_channel_size_attr = attr_value_pb2.AttrValue( + list=attr_value_pb2.AttrValue.ListValue( + shape=[ + tensor_shape_pb2.TensorShapeProto( + dim=[ + tensor_shape_pb2.TensorShapeProto.Dim( + size=filter_shape[-1] + ) + ] + ) + ] + ) + ) + self.assertTrue( + self._contains_op( + output_graphdef, 'Const', '_output_shapes', per_channel_size_attr + ) + ) + + input_tensor = array_ops.constant( + np.random.uniform(low=0, high=0.1, size=input_shape), + dtype=dtypes.float32, + ) + original_output = model.conv(input_tensor) + quantized_output = converted_model.signatures['serving_default']( + input_tensor + ) + + threshold = 0.015 if enable_per_channel_quantization else 0.02 + self.assertAllClose(original_output, quantized_output, atol=threshold) + @parameterized.named_parameters( # TODO(b/269421880): Enable legacy weight-only scheme with the uniform # quantized opset - ('to_xla_per_tensor', quant_opts_pb2.XLA, False), + ('to_xla_per_tensor', quant_opts_pb2.XLA, False, False), + ('to_xla_per_channel', quant_opts_pb2.XLA, True, False), + ('to_xla_per_channel_legacy', quant_opts_pb2.XLA, True, True), ) @test_util.run_in_graph_and_eager_modes def test_depthwise_conv2d_model( self, target_opset: quant_opts_pb2.OpSet, enable_per_channel_quantization: bool, + enable_legacy_weight_only: bool, ): + input_shape = (1, 3, 4, 512) filter_shape = (2, 3, 512, 2) strides = (1, 2, 2, 1) model = self._create_depthwise_conv2d_model( - input_shape=(1, 3, 4, 512), filter_shape=filter_shape, strides=strides + input_shape=input_shape, filter_shape=filter_shape, strides=strides ) saved_model_save.save(model, self._input_saved_model_path) @@ -4837,6 +5092,7 @@ class WeightOnlyQuantizationTest(quantize_model_test_base.QuantizedModelTest): ), op_set=target_opset, 
enable_per_channel_quantization=enable_per_channel_quantization, + enable_legacy_weight_only=enable_legacy_weight_only, ) converted_model = quantize_model.quantize( @@ -4858,13 +5114,48 @@ class WeightOnlyQuantizationTest(quantize_model_test_base.QuantizedModelTest): output_graphdef = output_loader.get_meta_graph_def_from_tags(tags).graph_def # Due to other meta data, the compression is not exactly 1/4. - self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2')) + if not enable_legacy_weight_only: + self.assertTrue(self._contains_op(output_graphdef, 'XlaConvV2')) + + size_threshold = 0.5 if enable_per_channel_quantization else 0.3 self.assertSizeRatioLessThan( self._output_saved_model_path, self._input_saved_model_path, - threshold=0.3, + threshold=size_threshold, ) + if enable_per_channel_quantization: + per_channel_size_attr = attr_value_pb2.AttrValue( + list=attr_value_pb2.AttrValue.ListValue( + shape=[ + tensor_shape_pb2.TensorShapeProto( + dim=[ + tensor_shape_pb2.TensorShapeProto.Dim( + size=filter_shape[2] * filter_shape[3] + ), + ] + ) + ] + ) + ) + self.assertTrue( + self._contains_op( + output_graphdef, 'Const', '_output_shapes', per_channel_size_attr + ) + ) + + input_tensor = array_ops.constant( + np.random.uniform(low=-0.1, high=0.1, size=input_shape), + dtype=dtypes.float32, + ) + original_output = model.depthwise_conv(input_tensor) + quantized_output = converted_model.signatures['serving_default']( + input_tensor + ) + + threshold = 0.68 if enable_per_channel_quantization else 1.3 + self.assertAllClose(original_output, quantized_output, atol=threshold) + @parameterized.named_parameters( ('to_tf_use_constant', quant_opts_pb2.TF, False), ('to_xla_use_constant', quant_opts_pb2.XLA, False), diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py index f2593d336f7..d7dc023e1dc 100644 --- 
a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test_base.py @@ -1114,11 +1114,20 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): """A simple model with a single depthwise conv2d, bias and relu.""" def __init__(self): - self.filters = np.random.uniform( - low=-10, high=10, size=filter_shape - ).astype('f4') - self.out_channel_size = filter_shape[2] * filter_shape[3] + + # This ensures filters will have different value range per out channel + self.filters = np.stack( + [ + np.random.uniform( + low=-(i + 1), high=(i + 1), size=filter_shape[:-2] + ).astype('f4') + for i in range(self.out_channel_size) + ], + axis=-1, + ) + self.filters = self.filters.reshape(filter_shape) + self.bias = np.random.uniform( low=0, high=10, size=(self.out_channel_size) ).astype('f4') @@ -1178,11 +1187,19 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): """A simple model with a single conv2d, bias and relu.""" def __init__(self): - self.filters = np.random.uniform( - low=-10, high=10, size=filter_shape - ).astype('f4') - self.out_channel_size = filter_shape[-1] + + # This ensures filters will have different value range per out channel + self.filters = np.stack( + [ + np.random.uniform( + low=-(i + 1), high=(i + 1), size=filter_shape[:-1] + ).astype('f4') + for i in range(self.out_channel_size) + ], + axis=-1, + ) + self.bias = np.random.uniform( low=0, high=10, size=(self.out_channel_size) ).astype('f4') @@ -1313,7 +1330,7 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): # Verify that when bias_size is not None, has_bias should be True. 
# And if bias_size is None, has_bias should be False using XNOR - assert (not ((bias_size is not None) ^ has_bias)) + assert not ((bias_size is not None) ^ has_bias) # Verify that bias size is correct if bias_size: @@ -1332,82 +1349,6 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): ) return model - def _create_einsum_model( - self, - saved_model_path: str, - equation: str, - input_shape: Sequence[int], - weight_shape: Sequence[int], - bias_shape: Optional[Sequence[int]] = None, - activation_fn: Optional[ops.Operation] = None, - ) -> module.Module: - class EinsumModel(module.Module): - """A simple model with a single einsum. - - Bias and activation function are optional. - """ - - def __init__( - self, - equation: str, - weight_shape: Sequence[int], - bias_shape: Optional[Sequence[int]] = None, - activation_fn: Optional[ops.Operation] = None, - ) -> None: - """Initializes a EinsumModel. - - Args: - equation: a string describing the contraction. - weight_shape: Shape of the weight tensor. - bias_shape: Shape of the bias. This is not always 1D so Einsum ops - usually use Add op instead of BiasAdd. - activation_fn: The activation function to be used. No activation - function if None. - """ - self.equation = equation - self.activation_fn = activation_fn - self.weight = np.random.uniform(low=-1.0, high=1.0, size=weight_shape) - self.bias = ( - np.random.uniform(low=-1.0, high=1.0, size=bias_shape) - if bias_shape is not None - else None - ) - - @def_function.function - def einsum(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: - """Evaluates the Einstein summation convention. - - Depending on self.has_bias and self.activation_fn, it may add a bias - term or go through the activaction function. - - Args: - input_tensor: Input tensor to einsum with the weight. - - Returns: - A map of: output key -> output result. 
- """ - out = tensorflow.einsum(self.equation, input_tensor, self.weight) - - if self.bias is not None: - out = out + self.bias - - if self.activation_fn is not None: - out = self.activation_fn(out) - - return {'output': out} - - model = EinsumModel(equation, weight_shape, bias_shape, activation_fn) - saved_model_save.save( - model, - saved_model_path, - signatures=model.einsum.get_concrete_function( - tensor_spec.TensorSpec( - shape=input_shape, dtype=dtypes.float32, name='input_tensor' - ) - ), - ) - return model - # Prepares sample einsum input data shapes. # This function returns: # 1. Shape for input 1 @@ -1435,7 +1376,7 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): out_labels = equation[arrow_pos + 1 :] # 2. Create sample shapes. - label_to_size = {'a': 2, 'b': 3, 'c': 4, 'd': 5, 'e': 6} + label_to_size = {'a': 4, 'b': 32, 'c': 64, 'd': 128, 'e': 8} x_shape = [label_to_size.get(x_label) for x_label in x_labels] y_shape = [label_to_size.get(y_label) for y_label in y_labels] bias_shape = None @@ -1460,7 +1401,7 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): ] return x_shape, y_shape, bias_shape, x_signature, y_signature - def _create_einsum_model_with_fake_quant( + def _create_einsum_model( self, equation: str, y_shape: Sequence[int], @@ -1468,9 +1409,10 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): y_signature: Sequence[Optional[int]], bias_shape: Optional[Sequence[int]] = None, activation_fn: Optional[ops.Operation] = None, + is_qat_model: bool = False, ) -> module.Module: class EinsumModel(module.Module): - """Einsum class with fakequants.""" + """Einsum class.""" def __init__(self): self._bias = None @@ -1509,33 +1451,35 @@ class QuantizedModelTest(test.TestCase, parameterized.TestCase): return self._einsum(x, y) def _einsum(self, x, y): - x = array_ops.fake_quant_with_min_max_vars( - x, - min=ops.convert_to_tensor(self._min[0]), - max=ops.convert_to_tensor(self._max[0]), - num_bits=8, - 
narrow_range=False, - ) - y = array_ops.fake_quant_with_min_max_vars( - y, - min=ops.convert_to_tensor(self._min[1]), - max=ops.convert_to_tensor(self._max[1]), - num_bits=8, - narrow_range=False, - ) + if is_qat_model: + x = array_ops.fake_quant_with_min_max_vars( + x, + min=ops.convert_to_tensor(self._min[0]), + max=ops.convert_to_tensor(self._max[0]), + num_bits=8, + narrow_range=False, + ) + y = array_ops.fake_quant_with_min_max_vars( + y, + min=ops.convert_to_tensor(self._min[1]), + max=ops.convert_to_tensor(self._max[1]), + num_bits=8, + narrow_range=False, + ) out = tensorflow.einsum(equation, x, y) if self._bias is not None: out = nn_ops.bias_add(out, self._bias) if activation_fn is not None: out = activation_fn(out) - out = array_ops.fake_quant_with_min_max_vars( - out, - min=ops.convert_to_tensor(self._min[2]), - max=ops.convert_to_tensor(self._max[2]), - num_bits=8, - narrow_range=False, - ) + if is_qat_model: + out = array_ops.fake_quant_with_min_max_vars( + out, + min=ops.convert_to_tensor(self._min[2]), + max=ops.convert_to_tensor(self._max[2]), + num_bits=8, + narrow_range=False, + ) return {'output': out} return EinsumModel() diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc index 0a3f2e95c36..d9a5aaf4e31 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc @@ -119,11 +119,14 @@ void AddExportPasses(const bool duplicate_shape_determining_constants, } pm.addPass(mlir::quant::CreateInsertMainFunctionPass()); + pm.addPass(mlir::quant::CreateLiftHashTableOpsAsArgsPass()); pm.addNestedPass( mlir::CreateFunctionalToExecutorDialectConversionPass()); pm.addPass(mlir::CreateBreakUpIslandsPass()); pm.addPass(mlir::quant::CreateMergeInitializerFunctionOpsToMainPass()); pm.addPass(mlir::quant::CreateMergeSaveFunctionOpsToMainPass()); + 
pm.addNestedPass( + mlir::quant::CreateMergeDuplicateResourceOpsPass()); // Used to clean up the "tf._noinliner" attribute that is previously used to // prevent certain functions from being inlined (see @@ -384,7 +387,7 @@ absl::Status UnfreezeConstantsAndSaveVariables( !create_dir_status.ok()) { LOG(ERROR) << "Failed to create checkpoint directory at: " << checkpoint_dir; - return tsl::ToAbslStatus(create_dir_status); + return create_dir_status; } TF_ASSIGN_OR_RETURN(const auto _, diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h index f17f20df4b6..2344c108016 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.h @@ -47,7 +47,7 @@ absl::StatusOr QuantizeQatModel( absl::string_view saved_model_path, const std::vector& signature_keys, const std::unordered_set& tags, - const QuantizationOptions& quant_opts, + const QuantizationOptions& quantization_options, const absl::flat_hash_map& function_aliases); // Apply post-training dynamic range quantization to the model. 
@@ -55,20 +55,20 @@ absl::StatusOr QuantizePtqDynamicRange( absl::string_view saved_model_path, const std::vector& signature_keys, const std::unordered_set& tags, - const QuantizationOptions& quant_opts); + const QuantizationOptions& quantization_options); absl::StatusOr QuantizePtqModelPreCalibration( absl::string_view saved_model_path, - const std::vector& exported_names, + const std::vector& signature_keys, const std::unordered_set& tags, - const QuantizationOptions& quant_opts, + const QuantizationOptions& quantization_options, const absl::flat_hash_map& function_aliases); absl::StatusOr QuantizePtqModelPostCalibration( absl::string_view saved_model_path, const std::vector& signature_keys, const std::unordered_set& tags, - const QuantizationOptions& quant_opts, + const QuantizationOptions& quantization_options, const absl::flat_hash_map& function_aliases); } // namespace quantization diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py index 758f99b62a0..53dd34cfc30 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py @@ -17,8 +17,8 @@ import collections.abc import tempfile from typing import Callable, Collection, Dict, Mapping, Optional, Sequence import uuid -from absl import logging +from absl import logging import numpy as np from tensorflow.compiler.mlir.quantization.tensorflow import exported_model_pb2 @@ -51,11 +51,12 @@ _ExperimentalMethod = quant_opts_pb2.QuantizationMethod.ExperimentalMethod _SignatureDefMap = Mapping[str, meta_graph_pb2.SignatureDef] # Default minimum number of elements in the weights for them to be quantized -# during dynamic range quantization (DRQ). +# during dynamic range quantization (DRQ) and weight-only quantization. 
_DYNAMIC_RANGE_DEFAULT_MIN_NUM_ELEMENTS_FOR_WEIGHTS = 1024 # Name of the saved model assets directory. _ASSETS_DIR = 'assets' +_ASSETS_EXTRA_DIR = 'assets.extra' def _is_qat_saved_model(saved_model_path: str): @@ -530,26 +531,27 @@ def _copy_assets(src_path: str, dst_path: str) -> None: src_path: Source saved model directory. dst_path: Destination saved model directory. This directory must exist. """ - src_assets_path = file_io.join(src_path, _ASSETS_DIR) - if not file_io.file_exists_v2(src_assets_path): - # Do nothing if the source assets path does not exist. - return + for assets_dir_name in [_ASSETS_DIR, _ASSETS_EXTRA_DIR]: + src_assets_path = file_io.join(src_path, assets_dir_name) + if not file_io.file_exists_v2(src_assets_path): + # Do nothing if the source assets path does not exist. + continue - dst_assets_path = file_io.join(dst_path, _ASSETS_DIR) - file_io.create_dir_v2(dst_assets_path) + dst_assets_path = file_io.join(dst_path, assets_dir_name) + file_io.create_dir_v2(dst_assets_path) - for curr_dir, _, files in file_io.walk_v2(src_assets_path): - for asset_file_name in files: - src_asset_file = file_io.join(curr_dir, asset_file_name) + for curr_dir, _, files in file_io.walk_v2(src_assets_path): + for asset_file_name in files: + src_asset_file = file_io.join(curr_dir, asset_file_name) - # Construct the destination assets file path. - curr_dst_dir = curr_dir.replace(src_assets_path, dst_assets_path) - dst_asset_file = file_io.join(curr_dst_dir, asset_file_name) + # Construct the destination assets file path. 
+ curr_dst_dir = curr_dir.replace(src_assets_path, dst_assets_path) + dst_asset_file = file_io.join(curr_dst_dir, asset_file_name) - file_io.copy_v2(src_asset_file, dst_asset_file) - logging.info( - 'Copied asset file: %s -> %s', src_asset_file, dst_asset_file - ) + file_io.copy_v2(src_asset_file, dst_asset_file) + logging.info( + 'Copied asset file: %s -> %s', src_asset_file, dst_asset_file + ) def _run_static_range_qat( @@ -1017,17 +1019,20 @@ def _populate_quantization_options_default_values( quantization_options: An instance of QuantizationOptions. """ if quantization_options.op_set == quant_opts_pb2.OpSet.OP_SET_UNSPECIFIED: - quantization_options.op_set = quant_opts_pb2.OpSet.TF + quantization_options.op_set = quant_opts_pb2.OpSet.XLA if not quantization_options.HasField('freeze_all_variables'): quantization_options.freeze_all_variables.enabled = True - if quantization_options.enable_per_channel_quantization and ( - quantization_options.op_set != quant_opts_pb2.OpSet.UNIFORM_QUANTIZED + # TODO(b/281595329): Implement static range quantization per-channel support + if quantization_options.enable_per_channel_quantization and not ( + quantization_options.op_set == quant_opts_pb2.OpSet.UNIFORM_QUANTIZED + or quantization_options.quantization_method.experimental_method + == _ExperimentalMethod.WEIGHT_ONLY ): raise ValueError( 'Currently, per-channel quantization is supported for Uniform ' - 'Quantized opset only.' + 'Quantized opset and Weight-only.' ) if ( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto index 6c5c520bffc..701f01a1da2 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.proto @@ -141,7 +141,7 @@ message QuantizationOptions { // units that are not specified in unit-wise configurations. 
QuantizationMethod quantization_method = 1; - OpSet op_set = 2; // If not specified, it defaults to `TF`. + OpSet op_set = 2; // If not specified, it defaults to `XLA`. QuantizationPrecision quantization_precision = 3; diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc index 6c085b85b2d..abbb663462b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc @@ -57,7 +57,7 @@ absl::Status RunPassesOnModuleOp(const absl::string_view mlir_dump_file_name, } if (failed(pass_manager.run(module_op))) { - return tsl::ToAbslStatus(statusHandler.ConsumeStatus()); + return statusHandler.ConsumeStatus(); } return absl::OkStatus(); @@ -106,7 +106,7 @@ absl::Status PreprocessAndFreezeGraph( if (session.has_value() && failed(mlir::tf_saved_model::FreezeVariables( module_op, session.value()))) { - return tsl::ToAbslStatus(statusHandler.ConsumeStatus()); + return statusHandler.ConsumeStatus(); } return RunPassesOnModuleOp( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/tests/BUILD index 2587a70d9cf..6484c365f92 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/BUILD @@ -21,6 +21,7 @@ filegroup( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", size_override = { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_hashtable_ops_as_args.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_hashtable_ops_as_args.mlir new file mode 100644 index 00000000000..20f37c578f2 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/lift_hashtable_ops_as_args.mlir @@ -0,0 +1,109 @@ +// RUN: tf-quant-opt %s 
-split-input-file -quant-lift-hashtable-ops-as-args | FileCheck %s +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1506 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_all_tables]} : () -> () + func.func @init_all_tables() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_init_all_tables"], tf_saved_model.initializer_type = "init_op"} { + %cst = "tf.Const"() {value = dense<["hello", "model", "quantization"]> : tensor<3x!tf_type.string>} : () -> tensor<3x!tf_type.string> + %cst_0 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.LookupTableImportV2"(%0, %cst, %cst_0) {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor<3x!tf_type.string>, tensor<3xi64>) -> () + return + } + +// Check that HashTable op in the initilizer is not lifted. 
+// CHECK: func.func @init_all_tables() +// CHECK: %[[OUT_0:.*]] = "tf.HashTableV2"() +// CHECK: "tf.LookupTableImportV2"(%[[OUT_0]] + func.func private @serving_default(%arg0: tensor ) -> (tensor<*xi64>) attributes {tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}} { + %cst = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<0.00235294132> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<0.00117647066> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<0.00156862743> : tensor} : () -> tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %1 = "tf.LookupTableSizeV2"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<1xi32> + %3 = "tf.StringToHashBucketFast"(%arg0) {device = "", num_buckets = 5 : i64} : (tensor) -> tensor + %4 = "tf.AddV2"(%3, %1) {device = ""} : (tensor, tensor) -> tensor + %5 = "tf.LookupTableFindV2"(%0, %arg0, %cst) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi64> + return %5 : tensor<*xi64> + } + +// Check that HashTable op is lifted. 
+// CHECK: func.func private @serving_default +// CHECK-SAME: (%arg0: tensor, %arg1: tensor) -> tensor<*xi64> +// CHECK-SAME: tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0,hash_table_1:0", outputs = "FakeQuantWithMinMaxArgs_2:0"} +// CHECK: "tf.LookupTableSizeV2"(%arg1) +// CHECK: "tf.LookupTableFindV2"(%arg1 + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_vocabs:0"]} ) -> (tensor<*xi64> {tf_saved_model.index_path = ["FakeQuantWithMinMaxArgs_2:0"]}) attributes {tf.entry_function = {inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default} : (tensor) -> (tensor<*xi64>) + %1 = "tf.Identity"(%0) : (tensor<*xi64>) -> tensor<*xi64> + return %1 : tensor<*xi64> + } + +// Check that the caller is updated. +// CHECK: func.func @main +// CHECK: %[[OUT_1:.*]] = "tf.HashTableV2"() +// CHECK: %[[OUT_2:.*]] = "tf.PartitionedCall"(%arg0, %[[OUT_1]]) +} +// ----- +// Test nested function case. 
+module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1506 : i32}, tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init_all_tables]} : () -> () + func.func @init_all_tables() attributes {tf_saved_model.exported_names = ["__tf_saved_model_session_initializer_init_all_tables"], tf_saved_model.initializer_type = "init_op"} { + %cst = "tf.Const"() {value = dense<["hello", "model", "quantization"]> : tensor<3x!tf_type.string>} : () -> tensor<3x!tf_type.string> + %cst_0 = "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + "tf.LookupTableImportV2"(%0, %cst, %cst_0) {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor<3x!tf_type.string>, tensor<3xi64>) -> () + return + } + +// Check that HashTable op in the initilizer is not lifted. +// CHECK: func.func @init_all_tables() +// CHECK: %[[OUT_0:.*]] = "tf.HashTableV2"() +// CHECK: "tf.LookupTableImportV2"(%[[OUT_0]] + func.func private @serving_default(%arg0: tensor ) -> (tensor<*xi64>) attributes {tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}} { + %0 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default1} : (tensor) -> (tensor<*xi64>) + %1 = "tf.Identity"(%0) : (tensor<*xi64>) -> tensor<*xi64> + return %1 : tensor<*xi64> + } +// Check that HashTable op is passed through. 
+// CHECK: func.func private @serving_default +// CHECK-SAME: (%arg0: tensor, %arg1: tensor) -> tensor<*xi64> +// CHECK-SAME: tf.entry_function = {control_outputs = "", inputs = "input_vocabs:0,hash_table_1:0", outputs = "FakeQuantWithMinMaxArgs_2:0"} +// CHECK: "tf.PartitionedCall"(%arg0, %arg1) + func.func private @serving_default1(%arg0: tensor ) -> (tensor<*xi64>) { + %cst = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<0.00235294132> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {value = dense<0.00117647066> : tensor} : () -> tensor + %cst_5 = "tf.Const"() {value = dense<-43> : tensor} : () -> tensor + %cst_6 = "tf.Const"() {value = dense<0.00156862743> : tensor} : () -> tensor + %0 = "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %1 = "tf.LookupTableSizeV2"(%0) {device = ""} : (tensor) -> tensor + %2 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<1xi32> + %3 = "tf.StringToHashBucketFast"(%arg0) {device = "", num_buckets = 5 : i64} : (tensor) -> tensor + %4 = "tf.AddV2"(%3, %1) {device = ""} : (tensor, tensor) -> tensor + %5 = "tf.LookupTableFindV2"(%0, %arg0, %cst) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi64> + return %5 : tensor<*xi64> + } + +// Check that HashTable op is lifted. 
+// CHECK: func.func private @serving_default1 +// CHECK-SAME: (%arg0: tensor, %arg1: tensor) -> tensor<*xi64> +// CHECK: "tf.LookupTableSizeV2"(%arg1) +// CHECK: "tf.LookupTableFindV2"(%arg1 + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_vocabs:0"]} ) -> (tensor<*xi64> {tf_saved_model.index_path = ["FakeQuantWithMinMaxArgs_2:0"]}) attributes {tf.entry_function = {inputs = "input_vocabs:0", outputs = "FakeQuantWithMinMaxArgs_2:0"}, tf_saved_model.exported_names = ["main"]} { + %0 = "tf.PartitionedCall"(%arg0) {config = "", config_proto = "", executor_type = "", f = @serving_default} : (tensor) -> (tensor<*xi64>) + %1 = "tf.Identity"(%0) : (tensor<*xi64>) -> tensor<*xi64> + return %1 : tensor<*xi64> + } +// Check that the caller is updated. +// CHECK: func.func @main +// CHECK: %[[OUT_1:.*]] = "tf.HashTableV2"() +// CHECK: %[[OUT_2:.*]] = "tf.PartitionedCall"(%arg0, %[[OUT_1]]) +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_duplicate_resource_ops.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_duplicate_resource_ops.mlir new file mode 100644 index 00000000000..c3099ea9418 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/merge_duplicate_resource_ops.mlir @@ -0,0 +1,108 @@ +// RUN: tf-quant-opt %s -split-input-file -quant-merge-duplicate-resource-ops | FileCheck %s + +func.func @merge_duplicate_variable(%arg0: tensor<1x20xf32>, %arg1: tensor) -> (tensor<20x4096xf32>) { + %0 = tf_executor.graph { + %outputs_5, %control_6 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_7, %control_8 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_9, %control_10 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_11, %control_12 = tf_executor.island 
wraps "tf.RestoreV2"(%arg1, %outputs_7, %outputs_5) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<20x4096xf32> + %control_13 = tf_executor.island(%control_12) wraps "tf.AssignVariableOp"(%outputs_9, %outputs_11) {validate_shape = false} : (tensor>>, tensor<20x4096xf32>) -> () + %control_14 = tf_executor.island(%control_13) wraps "tf.NoOp"() : () -> () + %outputs_15, %control_16 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_17, %control_18 = tf_executor.island wraps "tf.ReadVariableOp"(%outputs_15) : (tensor>>) -> tensor<20x4096xf32> + %outputs_19, %control_20 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_21, %control_22 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %control_23 = tf_executor.island(%control_18) wraps "tf.SaveV2"(%arg1, %outputs_19, %outputs_21, %outputs_17) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<20x4096xf32>) -> () + %outputs_24, %control_25 = tf_executor.island(%control_23) wraps "tf.Identity"(%arg1) : (tensor) -> tensor + tf_executor.fetch %outputs_17, %control_14, %control_25 : tensor<20x4096xf32>, !tf_executor.control, !tf_executor.control + } + return %0 : tensor<20x4096xf32> +} +// CHECK-LABEL: @merge_duplicate_variable +// CHECK: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.VarHandleOp"() +// CHECK: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.RestoreV2" +// CHECK: %[[CTL_2:.*]] = tf_executor.island(%[[CTL_1]]) wraps "tf.AssignVariableOp"(%[[OUT_0]], %[[OUT_1]]) + +// Check that ReadVariableOp now use the same variable op. 
+// CHECK: %[[OUT_3:.*]], %[[CTL_3:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[OUT_0]]) + +// ----- + +func.func @variables_with_different_shared_names(%arg0: tensor<1x20xf32>, %arg1: tensor) -> (tensor<20x4096xf32>) { + %0 = tf_executor.graph { + %outputs_5, %control_6 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_7, %control_8 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_9, %control_10 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_11, %control_12 = tf_executor.island wraps "tf.RestoreV2"(%arg1, %outputs_7, %outputs_5) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<20x4096xf32> + %control_13 = tf_executor.island(%control_12) wraps "tf.AssignVariableOp"(%outputs_9, %outputs_11) {validate_shape = false} : (tensor>>, tensor<20x4096xf32>) -> () + %control_14 = tf_executor.island(%control_13) wraps "tf.NoOp"() : () -> () + %outputs_15, %control_16 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_1"} : () -> tensor>> + %outputs_17, %control_18 = tf_executor.island wraps "tf.ReadVariableOp"(%outputs_15) : (tensor>>) -> tensor<20x4096xf32> + %outputs_19, %control_20 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_21, %control_22 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %control_23 = tf_executor.island(%control_18) wraps "tf.SaveV2"(%arg1, %outputs_19, %outputs_21, %outputs_17) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<20x4096xf32>) -> () + %outputs_24, %control_25 = tf_executor.island(%control_23) wraps "tf.Identity"(%arg1) : (tensor) -> 
tensor + tf_executor.fetch %outputs_17, %control_14, %control_25 : tensor<20x4096xf32>, !tf_executor.control, !tf_executor.control + } + return %0 : tensor<20x4096xf32> +} +// CHECK-LABEL: @variables_with_different_shared_names +// CHECK: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.VarHandleOp"() +// CHECK-SAME: shared_name = "MatMul/b_0" +// CHECK: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.RestoreV2" +// CHECK: %[[CTL_2:.*]] = tf_executor.island(%[[CTL_1]]) wraps "tf.AssignVariableOp"(%[[OUT_0]], %[[OUT_1]]) + +// Check that the second variable is not removed since they have different +// `shared_name` attribute. +// CHECK: %[[OUT_3:.*]], %[[CTL_3:.*]] = tf_executor.island wraps "tf.VarHandleOp"() +// CHECK-SAME: shared_name = "MatMul/b_1" +// CHECK: %[[OUT_4:.*]], %[[CTL_4:.*]] = tf_executor.island wraps "tf.ReadVariableOp"(%[[OUT_3]]) + +// ----- + +// Test two resource ops have the same shared_name but different types. +// expected-error @+1 {{This op has the same `shared_name` but different type with another}} +func.func @same_shared_name_but_different_types(%arg0: tensor<1x20xf32>, %arg1: tensor) -> (tensor<20x4096xf32>) { + %0 = tf_executor.graph { + %outputs_5, %control_6 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_7, %control_8 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_9, %control_10 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_11, %control_12 = tf_executor.island wraps "tf.RestoreV2"(%arg1, %outputs_7, %outputs_5) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<20x4096xf32> + %control_13 = tf_executor.island(%control_12) wraps "tf.AssignVariableOp"(%outputs_9, %outputs_11) {validate_shape = false} : (tensor>>, tensor<20x4096xf32>) -> () 
+ %control_14 = tf_executor.island(%control_13) wraps "tf.NoOp"() : () -> () + %outputs_15, %control_16 = tf_executor.island wraps "tf.VarHandleOp"() {container = "", shared_name = "MatMul/b_0"} : () -> tensor>> + %outputs_17, %control_18 = tf_executor.island wraps "tf.ReadVariableOp"(%outputs_15) : (tensor>>) -> tensor<20x4096xf32> + %outputs_19, %control_20 = tf_executor.island wraps "tf.Const"() {value = dense<"MatMul/b_0"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %outputs_21, %control_22 = tf_executor.island wraps "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %control_23 = tf_executor.island(%control_18) wraps "tf.SaveV2"(%arg1, %outputs_19, %outputs_21, %outputs_17) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>, tensor<20x4096xf32>) -> () + %outputs_24, %control_25 = tf_executor.island(%control_23) wraps "tf.Identity"(%arg1) : (tensor) -> tensor + tf_executor.fetch %outputs_17, %control_14, %control_25 : tensor<20x4096xf32>, !tf_executor.control, !tf_executor.control + } + return %0 : tensor<20x4096xf32> +} + +// ----- + +func.func @merge_hashtable_ops(%arg0: tensor) -> (tensor) { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %outputs_0, %control_1 = tf_executor.island wraps "tf.LookupTableSizeV2"(%outputs) {device = ""} : (tensor) -> tensor + %outputs_2, %control_3 = tf_executor.island wraps "tf.Const"() {value = dense<1.000000e+00> : tensor} : () -> tensor + %outputs_4, %control_5 = tf_executor.island wraps "tf.Identity"(%outputs_0) : (tensor) -> tensor + %control_8 = tf_executor.island(%control_3, %control_5) wraps "tf.NoOp"() : () -> () + %outputs_9, %control_10 = tf_executor.island wraps "tf.Const"() {value = dense<["hello", 
"model", "quantization"]> : tensor<3x!tf_type.string>} : () -> tensor<3x!tf_type.string> + %outputs_11, %control_12 = tf_executor.island wraps "tf.Const"() {value = dense<[0, 1, 2]> : tensor<3xi64>} : () -> tensor<3xi64> + %outputs_13, %control_14 = tf_executor.island wraps "tf.HashTableV2"() {container = "", device = "", key_dtype = !tf_type.string, shared_name = "hash_table_ce3dfbfc-7367-4d62-9d48-d13bf8125391", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + %control_15 = tf_executor.island wraps "tf.LookupTableImportV2"(%outputs_13, %outputs_9, %outputs_11) {_has_manual_control_dependencies = true, device = ""} : (tensor, tensor<3x!tf_type.string>, tensor<3xi64>) -> () + %control_16 = tf_executor.island(%control_15) wraps "tf.NoOp"() : () -> () + tf_executor.fetch %outputs_4, %control_8, %control_16 : tensor, !tf_executor.control, !tf_executor.control + } + return %0 : tensor +} + +// CHECK-LABEL: @merge_hashtable_ops +// CHECK: %[[OUT_0:.*]], %[[CTL_0:.*]] = tf_executor.island wraps "tf.HashTableV2"() +// CHECK: %[[OUT_1:.*]], %[[CTL_1:.*]] = tf_executor.island wraps "tf.LookupTableSizeV2"(%[[OUT_0]]) + +// Check that LookupTableImportV2 is using the same HashTableV2 with LookupTableSizeV2. 
+// CHECK: %[[CTL_2:.*]] = tf_executor.island wraps "tf.LookupTableImportV2"(%[[OUT_0]] diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir index ee97c375ba9..6eae6df1323 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir @@ -371,10 +371,20 @@ func.func @xla_gather_known_output_shape(%arg0: tensor<5xi32>, %arg1: tensor<1xi // ----- -func.func @replace_checknumerics_to_identity(%arg0: tensor<*xf32>) -> tensor<*xf32> { +func.func @remove_check_numerics_op(%arg0: tensor<*xf32>) -> tensor<*xf32> { %0 = "tf.CheckNumerics"(%arg0) {device = "", message = "transformer"} : (tensor<*xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } -// CHECK: func @replace_checknumerics_to_identity -// CHECK: %[[out:.*]] = "tf.Identity"(%arg0) : (tensor<*xf32>) -> tensor<*xf32> \ No newline at end of file +// CHECK: func @remove_check_numerics_op +// CHECK: return %arg0 : tensor<*xf32> + +// ----- + +func.func @remove_stop_gradient_op(%arg0: tensor<*xf32>) -> tensor<*xf32> { + %0 = "tf.StopGradient"(%arg0) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + func.return %0 : tensor<*xf32> +} + +// CHECK: func @remove_stop_gradient_op +// CHECK: return %arg0 : tensor<*xf32> diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op_weight_only.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op_weight_only.mlir new file mode 100644 index 00000000000..4f36784e67a --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/preprocess_op_weight_only.mlir @@ -0,0 +1,55 @@ +// RUN: tf-quant-opt %s -split-input-file -quant-preprocess-op='target-opset=XLA quantization-method=weight_only enable-per-channel-quantization=false' | FileCheck --check-prefix PerTensor %s +// RUN: tf-quant-opt %s 
-split-input-file -quant-preprocess-op='target-opset=XLA quantization-method=weight_only enable-per-channel-quantization=true' | FileCheck --check-prefix PerChannel %s + +module { + // For XLA weight-only per-channel depthwise convolution, tensor shape should have + // transformed from [H,W,C,M] to [H,W,1,CxM], + func.func @depthwise_conv(%arg0: tensor<1x3x4x3xf32>) -> (tensor<*xf32>) { + %cst_0 = "tf.Const"() {value = dense<0.000000e+00> : tensor<6xf32>} : () -> tensor<6xf32> + %cst_1 = "tf.Const"() {value = dense<[[[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]],[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]],[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]]],[[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]],[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]],[[3.0, 2.0], [1.0, 0.0],[3.0, 2.0]]]]> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "tf.PartitionedCall"(%arg0, %cst_1) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + %1 = "tf.BiasAdd"(%0, %cst_0) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> + func.return %1: tensor<*xf32> + } + func.func private @composite_depthwise_conv2d_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<*xf32> attributes {tf_quant.composite_function} { + %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) { + attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1] + } : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> + return %0 : tensor<*xf32> + } + +// PerTensor-LABEL: func @depthwise_conv +// PerTensor-DAG: %[[CONST_0:.*]] = arith.constant dense<0.000000e+00> : tensor<6xf32> +// PerTensor: %[[CONST_1:.*]] = arith.constant dense +// PerTensor-NOT: tensor<2x3x1x6xf32> +// PerTensor-SAME: tensor<2x3x3x2xf32> +// PerTensor: %[[PARTITIONEDCALL_0:.*]] = 
"tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<*xf32> +// PerTensor: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> +// PerTensor: return %[[BIAS_0:.*]] : tensor<*xf32> + +// PerTensor-LABEL: func private @composite_depthwise_conv2d_fn( +// PerTensor-SAME: %arg0: tensor<1x3x4x3xf32>, +// PerTensor-SAME: %arg1: tensor<2x3x3x2xf32>) +// PerTensor: %0 = "tf.DepthwiseConv2dNative"(%arg0, %arg1) {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", +// PerTensor: return %0 : tensor<*xf32> + +// PerChannel-LABEL: func @depthwise_conv +// PerChannel-DAG: %[[CONST_0:.*]] = arith.constant dense<0.000000e+00> : tensor<6xf32> +// PerChannel: %[[CONST_1:.*]] = arith.constant dense +// PerChannel-NOT: tensor<2x3x3x2xf32> +// PerChannel-SAME: tensor<2x3x1x6xf32> +// PerChannel: %[[PARTITIONEDCALL_0:.*]] = "tf.PartitionedCall"(%arg0, %[[CONST_1:.*]]) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x1x6xf32>) -> tensor<*xf32> +// PerChannel: %[[BIAS_0:.*]] = "tf.BiasAdd"(%[[PARTITIONEDCALL_0]], %[[CONST_0:.*]]) {data_format = "NHWC", device = ""} : (tensor<*xf32>, tensor<6xf32>) -> tensor<*xf32> +// PerChannel: return %[[BIAS_0:.*]] : tensor<*xf32> + +// PerChannel-LABEL: func private @composite_depthwise_conv2d_fn( +// PerChannel-SAME: %arg0: tensor<1x3x4x3xf32>, +// PerChannel-SAME: %arg1: tensor<2x3x3x2xf32>) + +// PerChannel-LABEL: func private @composite_depthwise_conv2d_fn_0( +// PerChannel-SAME: %arg0: tensor<1x3x4x3xf32>, +// PerChannel-SAME: %arg1: tensor<2x3x1x6xf32>) +// PerChannel: %0 = "tf.DepthwiseConv2dNative"(%arg0, 
%arg1) {attr_map = "0:strides,1:padding,2:explicit_paddings,3:dilations", data_format = "NHWC", device = "", +// PerChannel: return %0 : tensor<*xf32> +} + diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_weight_only.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_weight_only.mlir index 6d2fec56737..8c0786178ee 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_weight_only.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/quantize_composite_functions_weight_only.mlir @@ -1,4 +1,5 @@ -// RUN: tf-quant-opt %s -split-input-file -quant-insert-quantized-functions='quantization-method=weight_only target-opset=XLA' -quant-quantize-composite-functions='quantization-method=weight_only target-opset=XLA' -symbol-dce | FileCheck %s +// RUN: tf-quant-opt %s -split-input-file -quant-insert-quantized-functions='quantization-method=weight_only target-opset=XLA' -quant-quantize-composite-functions='quantization-method=weight_only target-opset=XLA' -symbol-dce | FileCheck --check-prefix=PerTensor %s +// RUN: tf-quant-opt %s -split-input-file -quant-insert-quantized-functions='quantization-method=weight_only target-opset=XLA' -quant-quantize-composite-functions='quantization-method=weight_only target-opset=XLA enable-per-channel-quantization=true' -symbol-dce | FileCheck --check-prefix=PerChannel %s module { // TODO(b/260020937): Support transpose_a, transpose_b for matmul. 
@@ -13,13 +14,21 @@ module { } } -// CHECK-LABEL: func @matmul -// CHECK-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<0> : tensor<12x2xi8>} : () -> tensor<12x2xi8> -// CHECK-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<3.93700805E-9> : tensor} : () -> tensor -// CHECK-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// CHECK-SAME: f = @quantized_matmul_fn_0} : (tensor<2x12xf32>, tensor<12x2xi8>, tensor, tensor) -> tensor<*xf32> -// CHECK: return %[[out]] +// PerTensor-LABEL: func @matmul +// PerTensor-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<0> : tensor<12x2xi8>} : () -> tensor<12x2xi8> +// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<3.93700805E-9> : tensor} : () -> tensor +// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// PerTensor: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_matmul_fn_0} : (tensor<2x12xf32>, tensor<12x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor: return %[[out]] + +// PerChannel-LABEL: func @matmul +// PerChannel-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<0> : tensor<12x2xi8>} : () -> tensor<12x2xi8> +// PerChannel-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<3.93700805E-9> : tensor} : () -> tensor +// PerChannel-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// PerChannel: %[[out:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_matmul_fn_0} : (tensor<2x12xf32>, tensor<12x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerChannel: return %[[out]] // ----- @@ -41,15 +50,25 @@ module { return %conv : tensor<*xf32> } -// CHECK-LABEL: func @conv -// CHECK-DAG: 
%[[q_w:.*]] = "tf.Const"() -// CHECK-DAG: %[[scale:.*]] = "tf.Const"() -// CHECK-DAG: %[[zp:.*]] = "tf.Const"() -// CHECK: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// CHECK-SAME: f = @quantized_conv2d_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> -// CHECK: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// CHECK-SAME: f = @quantized_conv2d_fn_0} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> -// CHECK: return %[[out_1]], %[[out_2]] +// PerTensor-LABEL: func @conv +// PerTensor-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> +// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<{{[0-9\.Ee\+\-]+}}> : tensor} : () -> tensor +// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor} : () -> tensor +// PerTensor: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_conv2d_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_conv2d_fn_0} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor: return %[[out_1]], %[[out_2]] + +// PerChannel-LABEL: func @conv +// PerChannel-DAG: %[[q_w:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> +// PerChannel-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<2xf32>} : () -> tensor<2xf32> +// PerChannel-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2xi32>} : () -> 
tensor<2xi32> +// PerChannel: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_conv2d_fn_1} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor<2xf32>, tensor<2xi32>) -> tensor<*xf32> +// PerChannel: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_conv2d_fn_0} : (tensor<1x2x2x3xf32>, tensor<2x3x3x2xi8>, tensor<2xf32>, tensor<2xi32>) -> tensor<*xf32> +// PerChannel: return %[[out_1]], %[[out_2]] } @@ -78,16 +97,31 @@ module { return %0 : tensor<*xf32> } -// CHECK-LABEL: func @depthwise_conv -// CHECK-DAG: %[[q_w1:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x1xi8>} -// CHECK-DAG: %[[q_w2:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> -// CHECK-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<0.0236220472> : tensor} : () -> tensor -// CHECK-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[bias:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3xf32>} -// CHECK: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w1]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// CHECK-SAME: f = @quantized_depthwise_conv2d_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xi8>, tensor, tensor) -> tensor<*xf32> -// CHECK: %[[out_1_add:.*]] = "tf.BiasAdd"(%[[out_1]], %[[bias]]) -// CHECK: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", -// CHECK-SAME: f = @quantized_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> -// CHECK: return %[[out_1_add]], %[[out_2]] +// PerTensor-LABEL: func @depthwise_conv +// PerTensor-DAG: %[[q_w1:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x1xi8>} +// PerTensor-DAG: 
%[[q_w2:.*]] = "tf.Const"() {value = dense<127> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> +// PerTensor-DAG: %[[scale:.*]] = "tf.Const"() {value = dense<0.0236220472> : tensor} : () -> tensor +// PerTensor-DAG: %[[zp:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// PerTensor-DAG: %[[bias:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3xf32>} +// PerTensor: %[[out_1:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w1]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_depthwise_conv2d_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor: %[[out_1_add:.*]] = "tf.BiasAdd"(%[[out_1]], %[[bias]]) +// PerTensor: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[scale]], %[[zp]]) {config = "", config_proto = "", executor_type = "", +// PerTensor-SAME: f = @quantized_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xi8>, tensor, tensor) -> tensor<*xf32> +// PerTensor: return %[[out_1_add]], %[[out_2]] + +// PerChannel-LABEL: func @depthwise_conv +// PerChannel-DAG: %[[bias1:.*]] = "tf.Const"() {value = dense<0.000000e+00> : tensor<3xf32>} : () -> tensor<3xf32> +// PerChannel-DAG: %[[q_w1:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2x3x3x1xi8>} : () -> tensor<2x3x3x1xi8> +// PerChannel-DAG: %[[q_w2:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2xi8> +// PerChannel-DAG: %[[scale1:.*]] = "tf.Const"() {value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<3xf32>} : () -> tensor<3xf32> +// PerChannel-DAG: %[[scale2:.*]] = "tf.Const"() {value = dense<{{[0-9\.Ee\+\-]+}}> : tensor<6xf32>} : () -> tensor<6xf32> +// PerChannel-DAG: %[[zp1:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<3xi32>} : () -> tensor<3xi32> +// PerChannel-DAG: %[[zp2:.*]] = "tf.Const"() {value = dense<{{[0-9]+}}> : tensor<6xi32>} : () -> tensor<6xi32> +// PerChannel: %[[out_1:.*]] = 
"tf.PartitionedCall"(%arg0, %[[q_w1]], %[[scale1]], %[[zp1]]) {config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_depthwise_conv2d_fn_1} : (tensor<1x3x4x3xf32>, tensor<2x3x3x1xi8>, tensor<3xf32>, tensor<3xi32>) -> tensor<*xf32> +// PerChannel: %[[out_1_add:.*]] = "tf.BiasAdd"(%[[out_1]], %[[bias1]]) +// PerChannel: %[[out_2:.*]] = "tf.PartitionedCall"(%arg0, %[[q_w2]], %[[scale2]], %[[zp2]]) {config = "", config_proto = "", executor_type = "", +// PerChannel-SAME: f = @quantized_depthwise_conv2d_fn_0} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xi8>, tensor<6xf32>, tensor<6xi32>) -> tensor<*xf32> +// PerChannel: return %[[out_1_add]], %[[out_2]] } diff --git a/tensorflow/compiler/mlir/stablehlo/BUILD b/tensorflow/compiler/mlir/stablehlo/BUILD new file mode 100644 index 00000000000..162572e5d7c --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/BUILD @@ -0,0 +1,56 @@ +load("//tensorflow:pytype.default.bzl", "pytype_library") +load("//tensorflow/tsl:tsl.default.bzl", "tsl_pybind_extension") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + ":friends", + ], + licenses = ["notice"], +) + +package_group( + name = "friends", + packages = [ + "//tensorflow/compiler/tests/...", + ], +) + +tsl_pybind_extension( + name = "stablehlo_extension", + srcs = [ + "stablehlo.cc", + "@stablehlo//:stablehlo/integrations/python/PortableApi.cpp", + ], + hdrs = [ + "@stablehlo//:stablehlo/integrations/python/PortableApi.h", + ], + copts = [ + "-fexceptions", + "-frtti", + ], + features = ["-use_header_modules"], + deps = [ + "@pybind11", + "@stablehlo//:stablehlo_portable_api", + ], +) + +pytype_library( + name = "stablehlo", + srcs = ["stablehlo.py"], + srcs_version = "PY3", + visibility = ["//visibility:public"], + deps = [ + ":stablehlo_extension", + ], +) + +py_test( + name = "stablehlo_test", + srcs = ["stablehlo_test.py"], + python_version = "PY3", + deps = [ + ":stablehlo", + ], 
+) diff --git a/tensorflow/compiler/xla/mlir_hlo/tosa/transforms/passes.h b/tensorflow/compiler/mlir/stablehlo/stablehlo.cc similarity index 53% rename from tensorflow/compiler/xla/mlir_hlo/tosa/transforms/passes.h rename to tensorflow/compiler/mlir/stablehlo/stablehlo.cc index acd63a76c25..0a256ff67c9 100644 --- a/tensorflow/compiler/xla/mlir_hlo/tosa/transforms/passes.h +++ b/tensorflow/compiler/mlir/stablehlo/stablehlo.cc @@ -1,4 +1,4 @@ -/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,25 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#ifndef MLIR_HLO_TOSA_TRANSFORMS_PASSES_H -#define MLIR_HLO_TOSA_TRANSFORMS_PASSES_H - -#include - -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Pass/Pass.h" +#include "pybind11/pybind11.h" // from @pybind11 +#include "stablehlo/integrations/python/PortableApi.h" // from @stablehlo namespace mlir { -namespace tosa { +namespace stablehlo { -std::unique_ptr> createLegalizeMhloPass(); -std::unique_ptr> createPrepareMhloPass(); +PYBIND11_MODULE(stablehlo_extension, m) { mlir::stablehlo::AddPortableApi(m); } -#define GEN_PASS_REGISTRATION -#define GEN_PASS_DECL_TOSALEGALIZEMHLOPASS -#include "passes.h.inc" - -} // namespace tosa +} // namespace stablehlo } // namespace mlir - -#endif // MLIR_HLO_TOSA_TRANSFORMS_PASSES_H diff --git a/tensorflow/python/training/tracking/python_state.py b/tensorflow/compiler/mlir/stablehlo/stablehlo.py similarity index 58% rename from tensorflow/python/training/tracking/python_state.py rename to tensorflow/compiler/mlir/stablehlo/stablehlo.py index 39e6e28addc..64c3f1b7be3 100644 --- a/tensorflow/python/training/tracking/python_state.py +++ 
b/tensorflow/compiler/mlir/stablehlo/stablehlo.py @@ -1,5 +1,4 @@ -"""Utilities for including Python state in TensorFlow checkpoints.""" -# Copyright 2018 The TensorFlow Authors. All Rights Reserved. +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -13,12 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== +"""StableHLO Portable Python APIs. +This setup only exports the the StableHLO Portable C++ APIs, which have +signatures that do not rely on MLIR classes. -# TODO(kathywu): Delete this file after all imports have been moved to the path -# below. -from tensorflow.python.trackable import python_state -from tensorflow.python.util import deprecation +Exporting all of MLIR Python bindings to TF OSS has high maintenance +implications, especially given the frequency that TF updates the revision of +LLVM used. +""" -__getattr__ = deprecation.deprecate_moved_module( - __name__, python_state, "2.11") +# pylint: disable=wildcard-import +from .stablehlo_extension import * diff --git a/tensorflow/compiler/mlir/stablehlo/stablehlo_test.py b/tensorflow/compiler/mlir/stablehlo/stablehlo_test.py new file mode 100644 index 00000000000..f6a1d1a75bb --- /dev/null +++ b/tensorflow/compiler/mlir/stablehlo/stablehlo_test.py @@ -0,0 +1,40 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Smoke test of functions in StableHLO Portable APIs.""" + +from tensorflow.compiler.mlir.stablehlo import stablehlo + + +def smoketest(): + """Test StableHLO Portable APIs.""" + assert isinstance(stablehlo.get_api_version(), int) + assembly = """ + module @jit_f_jax.0 { + func.func public @main(%arg0: tensor) -> tensor { + %0 = stablehlo.constant dense<1> : tensor + %1 = "stablehlo.compare"(%arg0, %0) {compare_type = #stablehlo, comparison_direction = #stablehlo} : (tensor, tensor) -> tensor + return %1 : tensor + } + } + """ + target = stablehlo.get_current_version() + artifact = stablehlo.serialize_portable_artifact(assembly, target) + deserialized = stablehlo.deserialize_portable_artifact(artifact) + rountrip = stablehlo.serialize_portable_artifact(deserialized, target) + assert artifact == rountrip + + +if __name__ == "__main__": + smoketest() diff --git a/tensorflow/compiler/mlir/tensorflow/BUILD b/tensorflow/compiler/mlir/tensorflow/BUILD index d49bca20c10..43206931918 100644 --- a/tensorflow/compiler/mlir/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/BUILD @@ -474,6 +474,7 @@ cc_library( ] + ["ir/tf_" + target["name"] + ".h.inc" for target in tf_ops_category_list], deps = [ ":attribute_utils", + ":convert_type", ":dynamic_shape_utils", ":rewrite_util", ":tensorflow_attributes", @@ -929,6 +930,19 @@ cc_library( ], ) +cc_library( + name = "string_util", + srcs = ["utils/string_util.cc"], + hdrs = ["utils/string_util.h"], + deps = [ + 
"@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + ], +) + cc_library( name = "fake_session", srcs = ["utils/fake_session.cc"], @@ -1048,6 +1062,7 @@ cc_library( ":tensorflow_ops", ":tensorflow_passes", ":tensorflow_types", + ":tf_saved_model_asset_sinking_pass", "//tensorflow/core:core_cpu", "//tensorflow/core:framework_internal", "//tensorflow/core:lib", @@ -1209,6 +1224,7 @@ cc_library( "transforms/drop_while_shape_invariant.cc", "transforms/einsum.cc", "transforms/embedding_pipelining.cc", + "transforms/embedding_sequencing.cc", "transforms/executor_island_coarsening.cc", "transforms/executor_tpuv1_inline_tpu_island.cc", "transforms/executor_tpuv1_island_coarsening.cc", @@ -1291,9 +1307,12 @@ cc_library( "transforms/tpu_variable_runtime_reformatting.cc", "transforms/update_control_dependencies.cc", "transforms/verify_suitable_for_graph_export_pass.cc", + "transforms/xla_call_module_deserialization.cc", + "transforms/xla_call_module_serialization.cc", "transforms/xla_cluster_formation.cc", "transforms/xla_inline_device_ops.cc", "transforms/xla_rewrite.cc", + "transforms/xla_validate_inputs.cc", "translate/breakup-islands.cc", "translate/split_into_island_per_op_pass.cc", "translate/tf_executor_to_functional.cc", @@ -1301,7 +1320,6 @@ cc_library( ], hdrs = [ "transforms/bridge.h", - "transforms/call_graph_util.h", "transforms/cluster_ops_by_policy.h", "transforms/collection_ops_util.h", "transforms/einsum.h", @@ -1318,6 +1336,7 @@ cc_library( deps = [ ":attribute_utils", ":bridge_logger", + ":call_graph_util", ":cluster_util", ":convert_tensor", ":convert_type", @@ -1333,6 +1352,8 @@ cc_library( ":parallel_execute_util", ":serialize_mlir_module_utils", ":shape_inference_pass", + ":stablehlo_custom_call_utils", + ":string_util", ":tensorflow", ":tensorflow_analysis", ":tensorflow_ops", @@ -1340,6 +1361,7 @@ cc_library( ":tensorflow_side_effects", 
":tensorflow_types", ":tf_data_optimization", + ":tf_device_pass_inc_gen", ":tf_legalize_hlo", ":tf_ops_layout_helper", ":tf_pass_inc_gen", @@ -1353,6 +1375,8 @@ cc_library( ":unroll_batch_matmul_pass", ":verification_utils", ":verify_suitable_for_graph_export", + ":visitor", + ":xla_call_module_attrs", ":xla_sharding_util", "//tensorflow/compiler/jit:flags_headers", "//tensorflow/compiler/mlir:op_or_arg_name_mapper", @@ -1360,11 +1384,13 @@ cc_library( "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf_with_tf2xla", "//tensorflow/compiler/tf2xla:side_effect_util", + "//tensorflow/compiler/tf2xla/kernels:xla_call_module_loader", "//tensorflow/compiler/xla:window_util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla:xla_proto_cc", "//tensorflow/compiler/xla/client:sharding_builder", "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -1378,6 +1404,7 @@ cc_library( "@com_google_absl//absl/container:btree", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", "@com_google_absl//absl/memory", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -1395,12 +1422,38 @@ cc_library( "@llvm-project//mlir:InferTypeOpInterface", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:Rewrite", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:Support", "@llvm-project//mlir:TensorDialect", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", + "@stablehlo//:chlo_ops", + "@stablehlo//:stablehlo_ops", + "@stablehlo//:stablehlo_portable_api", + "@stablehlo//:stablehlo_serialization", + "@stablehlo//:vhlo_ops", + ], +) + +cc_library( + name = "xla_call_module_attrs", + srcs = [], + 
hdrs = ["utils/xla_call_module_attrs.h"], + deps = ["@llvm-project//llvm:Support"], +) + +cc_library( + name = "stablehlo_custom_call_utils", + srcs = ["utils/stablehlo_custom_call.cc"], + hdrs = ["utils/stablehlo_custom_call.h"], + deps = [ + ":xla_call_module_attrs", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@stablehlo//:stablehlo_ops", ], ) @@ -1919,6 +1972,7 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -1981,11 +2035,15 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ - "//tensorflow/c:tf_status", - "//tensorflow/c/eager:c_api", - "//tensorflow/core:lib", - "//tensorflow/core/platform:logging", - "//tensorflow/core/protobuf:for_core_protos_cc", + ":convert_tensor", + ":export_tf_dialect_op", + ":tensorflow", + ":tensorflow_traits", + "//tensorflow/core/tfrt/fallback:fallback_state", + "//tensorflow/core/tfrt/fallback:op_kernel_runner", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", ], ) @@ -2000,9 +2058,9 @@ cc_library( ], visibility = ["//visibility:public"], deps = [ + ":constant_fold_utils", ":convert_tensor", ":export_graphdef", - ":export_tf_dialect_op", ":tensorflow", ":tensorflow_traits", ":tensorflow_types", @@ -2012,8 +2070,6 @@ cc_library( "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/ops", - "//tensorflow/core/tfrt/fallback:fallback_state", - "//tensorflow/core/tfrt/fallback:op_kernel_runner", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", @@ -2346,7 +2402,7 @@ tf_cc_test( "//tensorflow/core:test_main", "//tensorflow/core/platform:errors", "//tensorflow/tsl/platform:statusor", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", "@llvm-project//mlir:FuncDialect", ], ) @@ -2456,7 +2512,7 @@ tf_cc_test( "//tensorflow/core:test", 
"//tensorflow/core:test_main", "//tensorflow/core/platform:test", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", "@llvm-project//llvm:Support", "@llvm-project//mlir:AllPassesAndDialects", "@llvm-project//mlir:FuncDialect", @@ -2542,6 +2598,41 @@ cc_library( ], ) +cc_library( + name = "call_graph_util", + srcs = [ + "utils/call_graph_util.cc", + ], + hdrs = [ + "utils/call_graph_util.h", + ], + deps = [ + ":tensorflow", + "@com_google_absl//absl/strings", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "call_graph_util_test", + size = "small", + srcs = ["utils/call_graph_util_test.cc"], + deps = [ + ":attribute_utils", + ":call_graph_util", + ":tensorflow", + "//tensorflow/core:test", + "//tensorflow/core:test_main", + "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + ], +) + cc_library( name = "xla_sharding_util", srcs = [ @@ -2732,6 +2823,35 @@ cc_library( ], ) +cc_library( + name = "visitor", + srcs = ["utils/visitor.cc"], + hdrs = ["utils/visitor.h"], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "tf_saved_model_asset_sinking_pass", + srcs = ["transforms/tf_saved_model_asset_sinking_pass.cc"], + hdrs = ["transforms/tf_saved_model_asset_sinking_pass.h"], + deps = [ + ":tensorflow", + ":tensorflow_types", + ":tf_savedmodel_pass_inc_gen", + "//tensorflow/tsl/platform:path", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + ], +) + build_test( name = "tensorflow_build_test", targets = [ diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h 
b/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h index 89580d1edd7..9817b290c4c 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_value_typed_analyzer.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_VALUE_TYPED_ANALYZER_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_ANALYSIS_RESOURCE_VALUE_TYPED_ANALYZER_H_ +#include + #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index 14ae242525a..43db7e91a56 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -16,8 +16,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include +#include +#include #include #include +#include +#include #include "absl/container/node_hash_map.h" #include "llvm/ADT/DenseMap.h" @@ -30,6 +34,7 @@ limitations under the License. 
#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project @@ -255,6 +260,14 @@ class OpSideEffectCollector { for (Region& region : op->getRegions()) { AddRegionSideEffectsForOp(region, op); } + } else if (auto xla_call_module_op = dyn_cast(op)) { + for (auto func_symbol : xla_call_module_op.getFunctionList().getAsRange< + mlir::FlatSymbolRefAttr>()) { + if (auto func = symbol_table_collection_.lookupNearestSymbolFrom< + mlir::func::FuncOp>(xla_call_module_op, func_symbol)) { + AddRegionSideEffectsForOp(func.getBody(), op); + } + } } else { // Now handle all other ops. auto& side_effects_by_resource_id = op_side_effect_map_[op]; diff --git a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc index 6ed83b65428..05321522d50 100644 --- a/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc +++ b/tensorflow/compiler/mlir/tensorflow/c/c_api_unified_experimental_mlir.cc @@ -16,6 +16,8 @@ limitations under the License. 
#include #include #include +#include +#include #include "absl/strings/str_cat.h" #include "llvm/ADT/StringRef.h" @@ -165,7 +167,7 @@ class MlirAbstractOp : public TracingOperation { Status SetAttrType(const char* attr_name, tensorflow::DataType dtype) override; Status SetAttrShape(const char* attr_name, const int64_t* dims, - const int num_dims) override; + int num_dims) override; Status SetAttrFunction(const char* attr_name, const AbstractOperation* value) override; Status SetAttrFunctionName(const char* attr_name, const char* value, @@ -189,7 +191,7 @@ class MlirAbstractOp : public TracingOperation { const char* attr_name, absl::Span values) override; - Status SetOpName(const char* const op_name) override; + Status SetOpName(const char* op_name) override; MLIRContext* GetContext() { return context_; } @@ -543,7 +545,7 @@ Status MlirFunction::GetFunctionDef(tensorflow::FunctionDef** f) { TF_RETURN_IF_ERROR(diag_handler.ConsumeStatus()); tensorflow::GraphExportConfig configs; - fdef_.reset(new tensorflow::FunctionDef()); + fdef_ = std::make_unique(); TF_RETURN_IF_ERROR( ConvertMlirFunctionToFunctionLibraryDef(func_, configs, fdef_.get())); *f = fdef_.get(); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h index bc46b0c04ec..aa0f84eb122 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_arith_ops_folder.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ARITH_OPS_FOLDER_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_ARITH_OPS_FOLDER_H_ +#include + #include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h index d02b2b20e55..cad01806953 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h @@ -19,6 +19,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DIALECT_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_DIALECT_H_ +#include +#include + #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/Dialect.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 14ff8f37ae8..f063732db29 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -2509,15 +2509,16 @@ This is typically used by gradient computations for a concat operation. 
let arguments = (ins Arg:$concat_dim, - Arg, [{The `N` int32 vectors representing shape of tensors being concatenated.}]>:$shape + Arg, [{The `N` int32 or int64 vectors representing shape of tensors being concatenated.}]>:$shape ); let results = (outs - Res, [{The `N` int32 vectors representing the starting offset -of input tensors within the concatenated output.}]>:$offset + Res, [{The `N` vectors representing the starting offset +of input tensors within the concatenated output with type matching `shape`.}]>:$offset ); TF_DerivedOperandSizeAttr N = TF_DerivedOperandSizeAttr<1>; + TF_DerivedOperandTypeAttr shape_type = TF_DerivedOperandTypeAttr<1>; let hasVerifier = 1; @@ -4621,6 +4622,36 @@ This operation creates a tensor of `shape` and `dtype`. let hasFolder = 1; } +def TF_EncodePngOp : TF_Op<"EncodePng", [Pure]> { + let summary = "PNG-encode an image."; + + let description = [{ +`image` is a 3-D uint8 or uint16 Tensor of shape `[height, width, channels]` +where `channels` is: + +* 1: for grayscale. +* 2: for grayscale + alpha. +* 3: for RGB. +* 4: for RGBA. + +The ZLIB compression level, `compression`, can be -1 for the PNG-encoder +default or a value from 0 to 9. 9 is the highest compression level, generating +the smallest output, but is slower. + }]; + + let arguments = (ins + Arg, [{3-D with shape `[height, width, channels]`.}]>:$image, + + DefaultValuedOptionalAttr:$compression + ); + + let results = (outs + Res:$contents + ); + + TF_DerivedOperandTypeAttr T = TF_DerivedOperandTypeAttr<0>; +} + def TF_EnqueueTPUEmbeddingArbitraryTensorBatchOp : TF_Op<"EnqueueTPUEmbeddingArbitraryTensorBatch", [DeclareOpInterfaceMethods, SameVariadicOperandSize, TF_TPUEmbeddingWriteEffect]> { let summary = [{ Eases the porting of code that uses tf.nn.embedding_lookup_sparse(). @@ -11187,6 +11218,8 @@ underlying graph, and executes each of the partitioned subgraphs as a function. // Returns the callee of this operation. 
CallInterfaceCallable getCallableForCallee() { return getFAttr(); } + // Sets the callee from the callable. + void setCalleeFromCallable(CallInterfaceCallable callee); // returns the callee of this operation. func::FuncOp func() { @@ -21152,7 +21185,7 @@ for binary operators. }]; } -def TF_XlaCallModuleOp : TF_Op<"XlaCallModule", [Pure]> { +def TF_XlaCallModuleOp : TF_Op<"XlaCallModule", [DeclareOpInterfaceMethods, DeclareOpInterfaceMethods]> { let summary = "Invokes a StableHLO module."; let description = [{ @@ -21171,7 +21204,9 @@ platform argument (see `platforms`) nor the dimension arguments (see TF_ShapeAttrArray:$Sout, DefaultValuedOptionalAttr:$dim_args_spec, DefaultValuedOptionalAttr:$platforms, - DefaultValuedOptionalAttr:$function_list + DefaultValuedOptionalAttr:$function_list, + DefaultValuedOptionalAttr:$has_token_input_output, + DefaultValuedOptionalAttr:$disabled_checks ); let results = (outs @@ -22633,6 +22668,39 @@ expected to create these operators. }]; } +def TF__XlaCompileOp : TF_Op<"_XlaCompile", [AttrSizedOperandSegments]> { + let summary = "XLA Compile Op. For use by the XLA JIT only."; + + let description = [{ +Compiles a TensorFlow function into an XLA LocalExecutable and returns a key +that _XlaRun can use to look up the LocalExecutable and execute it. + }]; + + let arguments = (ins + Variadic:$constants, + Variadic:$args, + Variadic:$resources, + + BoolAttr:$must_compile, + SymbolRefAttr:$function + ); + + let results = (outs + Res:$key, + Res:$compilation_successful + ); + + TF_DerivedOperandSizeAttr Nresources = TF_DerivedOperandSizeAttr<2>; + TF_DerivedOperandTypeListAttr Targs = TF_DerivedOperandTypeListAttr<1>; + TF_DerivedOperandTypeListAttr Tconstants = TF_DerivedOperandTypeListAttr<0>; +} + def TF__XlaHostComputeMlirOp : TF_Op<"_XlaHostComputeMlir", [TF_RecvSideEffect, TF_SendSideEffect, TF_XlaHostComputeSideEffect]> { let summary = [{ A pseudo-op to represent host-side computation in an XLA program. 
@@ -22703,6 +22771,27 @@ execution the transfer corresponds to.}]>:$dynamic_key, TF_DerivedResultTypeListAttr Toutputs = TF_DerivedResultTypeListAttr<0>; } +def TF__XlaRunOp : TF_Op<"_XlaRun", []> { + let summary = "XLA Run Op. For use by the XLA JIT only."; + + let description = [{ +Executes a TensorFlow function previously compiled into a LocalExecutable by an +_XlaCompile op. + }]; + + let arguments = (ins + Variadic:$args, + TF_StrTensor:$key + ); + + let results = (outs + Variadic:$results + ); + + TF_DerivedOperandTypeListAttr Targs = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Tresults = TF_DerivedResultTypeListAttr<0>; +} + def TF__XlaSendFromHostOp : TF_Op<"_XlaSendFromHost", [DeclareOpInterfaceMethods, TF_SendSideEffect]> { let summary = "A placeholder op to send values to a running XLA computation."; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td index c9a890778f6..d40089d2948 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops.td @@ -409,6 +409,8 @@ def TF_LegacyCallOp : TF_Op<"LegacyCall", // Returns the callee of this operation. CallInterfaceCallable getCallableForCallee() { return getFAttr(); } + // Sets the callee from the callable + void setCalleeFromCallable(::mlir::CallInterfaceCallable callee); // Returns the resolved callee function of this operation. // Prefer passing in SymbolTableCollection to reduce lookup costs by @@ -570,6 +572,8 @@ underlying graph, and executes each of the partitioned subgraphs as a function. // Returns the callee of this operation. CallInterfaceCallable getCallableForCallee() { return getFAttr(); } + // Sets the callee from the callable + void setCalleeFromCallable(::mlir::CallInterfaceCallable callee); // Returns the resolved callee function of this operation. 
// Prefer passing in SymbolTableCollection to reduce lookup costs by @@ -1009,6 +1013,8 @@ def TF_TPUPartitionedCallOp : TF_Op<"TPUPartitionedCall", // Returns the callee of this operation. CallInterfaceCallable getCallableForCallee() { return getFAttr(); } + // Sets the callee from the callable. + void setCalleeFromCallable(CallInterfaceCallable callee); // Returns the resolved callee function of this operation. // Prefer passing in SymbolTableCollection to reduce lookup costs by diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc index dfa46846aa1..7f066b3f327 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h" #include +#include +#include #include #include #include @@ -3214,6 +3216,16 @@ LogicalResult LegacyCallOp::verifySymbolUses( return success(); } +void LegacyCallOp::setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + // Direct call. + if (SymbolRefAttr fAttr = getFAttr()) { + SymbolRefAttr calleeAttr = callee.get(); + return setFAttr(cast(calleeAttr)); + } + // Indirect call, callee Value is the first operand. + return setOperand(0, callee.get()); +} + //===----------------------------------------------------------------------===// // LogOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h index 77f87e0f960..29dae2715a6 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_layout_helper.h @@ -16,6 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_LAYOUT_HELPER_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_IR_TF_OPS_LAYOUT_HELPER_H_ +#include +#include + #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc index 36b9d6c6e20..62a047bd441 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.cc @@ -17,8 +17,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" #include +#include #include #include +#include #include #include #include @@ -61,6 +63,7 @@ limitations under the License. #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -79,6 +82,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/rewrite_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" #include "tensorflow/core/platform/logging.h" #include "tensorflow/core/util/tensor_format.h" @@ -563,6 +567,31 @@ LogicalResult TPUPartitionedCallOp::verifySymbolUses( return VerifyPartitionedCall(*this, symbolTable); } +template +static void SetPartitionCalleeFromCallable(CallOpClass op, + mlir::CallInterfaceCallable callee) { + // Direct call. 
+ if (SymbolRefAttr fAttr = op.getFAttr()) { + SymbolRefAttr calleeAttr = callee.get<SymbolRefAttr>(); + return op.setFAttr(cast<FlatSymbolRefAttr>(calleeAttr)); + } + // Indirect call, callee Value is the first operand. + return op.setOperand(0, callee.get<Value>()); +} + +void PartitionedCallOp::setCalleeFromCallable( + mlir::CallInterfaceCallable callee) { + return SetPartitionCalleeFromCallable(*this, callee); +} +void StatefulPartitionedCallOp::setCalleeFromCallable( + CallInterfaceCallable callee) { + return SetPartitionCalleeFromCallable(*this, callee); +} +void TPUPartitionedCallOp::setCalleeFromCallable( + mlir::CallInterfaceCallable callee) { + return SetPartitionCalleeFromCallable(*this, callee); +} + //===----------------------------------------------------------------------===// // PowOp //===----------------------------------------------------------------------===// @@ -1057,7 +1086,7 @@ static Type InferSelectV2OpType(Value condition, Value e, Value t) { if (!cond_ranked_ty || !broadcasted_ranked_ty) return unranked_ty; // Explicitly get broadcasted output type as element types of condition may - // not be same as the broadcated type's element type. + // not be same as the broadcasted type's element type. SmallVector result_shape; if (!OpTrait::util::getBroadcastedShape(cond_ranked_ty.getShape(), broadcasted_ranked_ty.getShape(), @@ -2829,6 +2858,27 @@ OpFoldResult FoldCancellableTranspose(TransposeOp op) { auto transpose = dyn_cast_or_null(op.getX().getDefiningOp()); if (!transpose) return {}; + // If the transpose ops are on different devices, we don't fold them. + if (transpose->getBlock() != op->getBlock()) { + tensorflow::DataType dtype; + auto status = tensorflow::ConvertToDataType( + op.getX().getType().cast<ShapedType>().getElementType(), &dtype); + if (status.ok()) { + // We can only leave the transpose op on host if its dtype is supported on + // host.
+ if (dtype == tensorflow::DT_UINT64 || dtype == tensorflow::DT_INT64 || + dtype == tensorflow::DT_UINT32 || dtype == tensorflow::DT_INT32 || + dtype == tensorflow::DT_UINT16 || dtype == tensorflow::DT_INT16 || + dtype == tensorflow::DT_UINT8 || dtype == tensorflow::DT_INT8 || + dtype == tensorflow::DT_HALF || dtype == tensorflow::DT_BFLOAT16 || + dtype == tensorflow::DT_FLOAT || dtype == tensorflow::DT_DOUBLE || + dtype == tensorflow::DT_COMPLEX64 || + dtype == tensorflow::DT_COMPLEX128 || dtype == tensorflow::DT_BOOL) { + return {}; + } + } + } + // Permutations defined by constant operations. DenseIntElementsAttr perm0; DenseIntElementsAttr perm1; @@ -2933,6 +2983,39 @@ void FusedBatchNormOp::getCanonicalizationPatterns(RewritePatternSet &results, results.add(context); } +//===----------------------------------------------------------------------===// +// XlaCallModuleOp +//===----------------------------------------------------------------------===// + +void XlaCallModuleOp::getEffects( + SmallVectorImpl> + &effects) { + if (!getFunctionList().empty()) { + // The StableHLO module embedded in XlaCallModule contains + // `stablehlo.custom_call` calling TF host callback functions. + // `stablehlo.custom_call` will be lowered to `stablehlo.send` and + // `stablehlo.recv`. 
+ effects.emplace_back(MemoryEffects::Write::get(), + ResourceEffects::Send::get()); + effects.emplace_back(MemoryEffects::Write::get(), + ResourceEffects::Recv::get()); + effects.emplace_back(MemoryEffects::Write::get(), + ResourceEffects::XlaHostCompute::get()); + } +} + +LogicalResult XlaCallModuleOp::verifySymbolUses( + SymbolTableCollection &symbolTable) { + for (auto f : getFunctionList()) { + auto func = symbolTable.lookupNearestSymbolFrom( + getOperation(), f.cast()); + if (!func) { + return emitOpError() << "refers to an undefined function: " << f; + } + } + return success(); +} + //===----------------------------------------------------------------------===// // XlaLaunchOp //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc index 7bf2b3ca1f1..b295461d533 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include + #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/BUILD index 6cc7344b083..4c2e9dc642c 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", size_override = { diff --git a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir index 49ba1afe393..7bca5e649b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/canonicalize.mlir @@ -906,6 +906,26 @@ func.func @cancellableTranspose(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf3 // CHECK: return %arg0 } +// CHECK-LABEL: @nonCancellableTransposeCrossRegion +func.func @nonCancellableTransposeCrossRegion(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> { + %0 = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} : () -> tensor<4xi32> + %1 = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} : () -> tensor<4xi32> + %2 = "tf.Transpose"(%arg0, %0) : (tensor<1x4x4x8xf32>, tensor<4xi32>) -> tensor<1x8x4x4xf32> + + %result = "tf_device.launch"() ({ + %3 = "tf.Transpose"(%2, %1) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> + tf_device.return %3: tensor<1x4x4x8xf32> + }) {device = "device"} : () -> tensor<1x4x4x8xf32> + + func.return %result : tensor<1x4x4x8xf32> + + // CHECK-DAG: %[[CONST1:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi32>} + // CHECK-DAG: %[[CONST2:.*]] = "tf.Const"() {value = dense<[0, 2, 3, 1]> : tensor<4xi32>} + // CHECK: %[[TRANS1:.*]] = "tf.Transpose"(%arg0, %[[CONST1]]) : (tensor<1x4x4x8xf32>, 
tensor<4xi32>) -> tensor<1x8x4x4xf32> + // CHECK: %[[TRANS2:.*]] = "tf.Transpose"(%[[TRANS1]], %[[CONST2]]) : (tensor<1x8x4x4xf32>, tensor<4xi32>) -> tensor<1x4x4x8xf32> + // CHECK: return %[[TRANS2]] +} + // CHECK-LABEL: @cancellableTransposeConst func.func @cancellableTransposeConst(%arg0: tensor<1x4x4x8xf32>) -> tensor<1x4x4x8xf32> { %0 = arith.constant dense<[0, 3, 1, 2]> : tensor<4xi32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD index 9132abf2fe5..20ca45e8264 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/compile_mlir_util/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = [ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir b/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir index f6bd3d4d586..408342b0ebd 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/embedding_pipelining.mlir @@ -8,13 +8,19 @@ module { return } func.func private @while_body(%arg0: tensor) -> (tensor) { - // Verify that everything is extracted into one of the four functions. + // Verify the overall pipelining control flow and supporting functions. // The order of these functions is also significant. 
- // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} - // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} - // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} - // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_backward.*}} - // CHECK-NEXT: return + // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @non_tpu.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @start_step_0.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @non_tpu.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @start_step_1.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}} + // CHECK: {{.*tf.While.* body = @new_while_body.* cond = @new_while_cond.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm2.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm1.*}} + // CHECK: return // metadata ops "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor @@ -37,39 +43,20 @@ module { %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor return %0 : tensor } - // Generated functions + // Generated functions for control flow ops (if, while, switch) + // non_tpu should have to TPU ops - just identity and return (in this test). 
- // CHECK: func.func private @_func_non_tpu + // CHECK: func.func private @non_tpu // CHECK-NEXT: tf.Identity // CHECK-NEXT: return - // sc_forward should have TPU ops including replicated outputs but not inputs - // CHECK: func.func private @_func_sc_forward - // CHECK-NOT: TPUReplicatedInput - // CHECK-DAG: TPUReplicateMetadata - // CHECK-DAG: TPUCompilationResult - // CHECK-DAG: TPUReplicatedOutput - // CHECK: return - - // core_tput should have TPU ops including both replicated inputs and outputs - // CHECK: func.func private @_func_core_tpu - // CHECK-DAG: TPUReplicatedInput - // CHECK-DAG: TPUReplicateMetadata - // CHECK-DAG: TPUCompilationResult - // CHECK-DAG: TPUReplicatedOutput - // CHECK: return - - // sc_backward should have TPU ops including replicted inputs but not outputs - // CHECK: func.func private @_func_sc_backward - // CHECK-NOT: TPUReplicatedOutput - // CHECK-DAG: TPUReplicateMetadata - // CHECK-DAG: TPUCompilationResult - // CHECK-DAG: TPUReplicatedInput - // CHECK: return + // Since there is a backward pass, finish_step_nm2 should be non-empty. + // CHECK: func.func private @finish_step_nm2 + // CHECK-NEXT: tf.TPUReplicateMetadata } // ----- -// This test verifies that the extraction works correctly for evaluation-only models. +// This test verifies that the pipelining works correctly for evaluation-only models. module { func.func @main() { %cst = "tf.Const"() {value = dense<2> : tensor} : () -> tensor @@ -77,9 +64,19 @@ module { return } func.func private @while_body(%arg0: tensor) -> (tensor) { - // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} - // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} - // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} + // The pipelining control flow and supporting functions stay the same as the training version above. + // The order of these functions is also significant. 
+ // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @non_tpu.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @start_step_0.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @non_tpu.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @start_step_1.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @while_cond.*}} + // CHECK: {{.*tf.While.* body = @new_while_body.* cond = @new_while_cond.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm2.*}} + // CHECK: {{.*StatefulPartitionedCall.* f = @finish_step_nm1.*}} + // CHECK: return // metadata ops "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor @@ -99,8 +96,8 @@ module { %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor return %0 : tensor } - // Only verify sc_backward. The previous test case verifies everything else. - // CHECK: func.func private @_func_sc_backward + // There's no backward pass so finish_step_nm2 should be empty + // CHECK: func.func private @finish_step_nm2 // CHECK-NEXT: return } @@ -147,43 +144,6 @@ module { } } -// ----- -// A test verifying TPUReplicatedOutput in the input graph doesn't trigger -// any additional TPUReplicatedInput or TPUReplicatedOutput ops. 
-module { - func.func @main() { - %cst_1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor - %cst_2 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor - %0:2 = "tf.While"(%cst_1, %cst_2) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor, tensor) -> (tensor, tensor) - return - } - func.func private @while_body(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { - // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} - // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} - // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} - // metadata ops - "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () - %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor - %2 = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<3> : tensor} : () -> tensor - %3:2 = "tf.TPUReplicatedOutput"(%2) {device = ""} : (tensor) -> (tensor, tensor) - - // core_tpu ops: - %res_t = "tf.Const"() {_replication_info = "repl_info", value = dense<4> : tensor} : () -> tensor - - // non_tpu_ops - %res_n = "tf.Const"() {value = dense<5> : tensor} : () -> tensor - - return %res_n, %3#1 : tensor, tensor - } - func.func private @while_cond(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "tf.Less"(%arg1, %arg0) : (tensor, tensor) -> tensor - return %0 : tensor - } - // CHECK-DAG: TPUReplicatedOutput - // CHECK-NOT: TPUReplicatedoutput - // CHECK-NOT: TPUReplicatedInput -} - // ----- // Verify error for backward pass with no forward pass. module { @@ -317,3 +277,207 @@ module { return %0 : tensor } } + +// ----- +// Verify one while body function per while loop op. 
+module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + // expected-error @+1 {{'tf.While' op multiple users of function.}} + %1 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// Verify that the function to be pipelined is a while loop body function. 
+module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + return + } + // expected-error @+1 {{'func.func' op unable to find while body user.}} + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// This test verifies that TPUReplicatedInputOps for resource variable args are packed. 
+module { + func.func @main(%arg0: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0, %1 = "tf.While"(%cst_main, %arg0) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor, tensor<*x!tf_type.resource>>) -> (tensor, tensor<*x!tf_type.resource>>) + return + } + + func.func private @while_body(%arg0: tensor, %arg1: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> (tensor, tensor<*x!tf_type.resource>>) { + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // expected-error @+1 {{'tf.TPUReplicatedInput' op unexpected variable input, not packed}} + %37 = "tf.TPUReplicatedInput"(%arg1) {device = "", index = -1 : i64, is_mirrored_variable = true, is_packed = false} : (tensor<*x!tf_type.resource>>) -> tensor<*x!tf_type.resource>> + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + %cst_12 = "tf.Const"() {_replication_info = "repl_info", _xla_compile_device_type = "TPU", device = "", value = dense<1> : tensor} : () 
-> tensor + "tf.AssignAddVariableOp"(%37, %cst_12) {_has_manual_control_dependencies = true, _replication_info = "while/cluster_while_body_451", _xla_compile_device_type = "TPU", device = ""} : (tensor<*x!tf_type.resource>>, tensor) -> () + + return %res_n, %arg1 : tensor, tensor<*x!tf_type.resource>> + } + func.func private @while_cond(%arg0: tensor, %arg1: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// This test verifies that duplicate TPUReplicatedInput ops for a resource variable arg is an error. +module { + func.func @main(%arg0: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0, %1 = "tf.While"(%cst_main, %arg0) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor, tensor<*x!tf_type.resource>>) -> (tensor, tensor<*x!tf_type.resource>>) + return + } + + func.func private @while_body(%arg0: tensor, %arg1: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> (tensor, tensor<*x!tf_type.resource>>) { + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // expected-error @+1 {{'tf.TPUReplicatedInput' op unexpected multiple TPUReplicatedInputOp for 
single argument}} + %37 = "tf.TPUReplicatedInput"(%arg1) {device = "", index = -1 : i64, is_mirrored_variable = true, is_packed = true} : (tensor<*x!tf_type.resource>>) -> tensor<*x!tf_type.resource>> + %38 = "tf.TPUReplicatedInput"(%arg1) {device = "", index = -1 : i64, is_mirrored_variable = true, is_packed = true} : (tensor<*x!tf_type.resource>>) -> tensor<*x!tf_type.resource>> + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + %cst_12 = "tf.Const"() {_replication_info = "repl_info", _xla_compile_device_type = "TPU", device = "", value = dense<1> : tensor} : () -> tensor + "tf.AssignAddVariableOp"(%37, %cst_12) {_has_manual_control_dependencies = true, _replication_info = "while/cluster_while_body_451", _xla_compile_device_type = "TPU", device = ""} : (tensor<*x!tf_type.resource>>, tensor) -> () + + return %res_n, %arg1 : tensor, tensor<*x!tf_type.resource>> + } + func.func private @while_cond(%arg0: tensor, %arg1: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// This test verifies the EliminateResourceLoops workaround. 
+module { + func.func @main(%arg0: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0, %1 = "tf.While"(%cst_main, %arg0) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor, tensor<*x!tf_type.resource>>) -> (tensor, tensor<*x!tf_type.resource>>) + return + } + + func.func private @while_body(%arg0: tensor, %arg1: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> (tensor, tensor<*x!tf_type.resource>>) { + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + %rsrc_copy = "tf.StatefulPartitionedCall"(%arg1) {f = @broken_func, config = "", config_proto = "", executor_type = ""} : (tensor<*x!tf_type.resource>>) -> (tensor<*x!tf_type.resource>>) + // We expect uses of %rsrc_copy are replaced by the input resource variable (%arg1 in this context). 
+ "tf.StatefulPartitionedCall"(%arg1) {f = @func1, config = "", config_proto = "", executor_type = ""} : (tensor<*x!tf_type.resource>>) -> () + "tf.StatefulPartitionedCall"(%rsrc_copy) {f = @func2, config = "", config_proto = "", executor_type = ""} : (tensor<*x!tf_type.resource>>) -> () + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + %cst_12 = "tf.Const"() {_replication_info = "repl_info", _xla_compile_device_type = "TPU", device = "", value = dense<1> : tensor} : () -> tensor + + return %res_n, %arg1 : tensor, tensor<*x!tf_type.resource>> + } + func.func private @while_cond(%arg0: tensor, %arg1: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @broken_func(%arg0: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> (tensor<*x!tf_type.resource>>) { + %x = "tf.Identity"(%arg0) : (tensor<*x!tf_type.resource>>) -> (tensor<*x!tf_type.resource>>) + %y = "tf.Identity"(%x) : (tensor<*x!tf_type.resource>>) -> (tensor<*x!tf_type.resource>>) + return %y : tensor<*x!tf_type.resource>> + } + func.func private @func1(%arg0: tensor<*x!tf_type.resource>> {tf._composite_device = 
"/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> () { + return + } + func.func private @func2(%arg0: tensor<*x!tf_type.resource>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf._user_specified_name = "rsrc", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> () { + return + } + // Make sure func1 and func2 use the original resource variable and not the result of @broken_func. + // CHECK: func.func private @non_tpu + // CHECK: {{.*%0 = \"tf.StatefulPartitionedCall\"\(%arg0\).*f = @broken_func.*}} + // CHECK: {{.*StatefulPartitionedCall\"\(%arg0\).*f = @func1.*}} + // CHECK: {{.*StatefulPartitionedCall\"\(%arg0\).*f = @func2.*}} +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/embedding_sequencing.mlir b/tensorflow/compiler/mlir/tensorflow/tests/embedding_sequencing.mlir new file mode 100644 index 00000000000..0a8a3069861 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/embedding_sequencing.mlir @@ -0,0 +1,319 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-embedding-sequencing | FILECHECK_OPTS="" FileCheck %s + +// This test verifies the handling of TPU replicated inputs and outputs as well as the extraction of the four main functions. +module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + // Verify that everything is extracted into one of the four functions. + // The order of these functions is also significant. 
+ // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_backward.*}} + // CHECK-NEXT: return + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } + // Generated functions + // non_tpu should have no TPU ops - just identity and return (in this test).
+ // CHECK: func.func private @_func_non_tpu + // CHECK-NEXT: tf.Identity + // CHECK-NEXT: return + + // sc_forward should have TPU ops including replicated outputs but not inputs + // CHECK: func.func private @_func_sc_forward + // CHECK-NOT: TPUReplicatedInput + // CHECK-DAG: TPUReplicateMetadata + // CHECK-DAG: TPUCompilationResult + // CHECK-DAG: TPUReplicatedOutput + // CHECK: return + + // core_tpu should have TPU ops including both replicated inputs and outputs + // CHECK: func.func private @_func_core_tpu + // CHECK-DAG: TPUReplicatedInput + // CHECK-DAG: TPUReplicateMetadata + // CHECK-DAG: TPUCompilationResult + // CHECK-DAG: TPUReplicatedOutput + // CHECK: return + + // sc_backward should have TPU ops including replicated inputs but not outputs + // CHECK: func.func private @_func_sc_backward + // CHECK-NOT: TPUReplicatedOutput + // CHECK-DAG: TPUReplicateMetadata + // CHECK-DAG: TPUCompilationResult + // CHECK-DAG: TPUReplicatedInput + // CHECK: return +} + +// ----- +// This test verifies that the extraction works correctly for evaluation-only models.
+module { + func.func @main() { + %cst = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %0 = "tf.While"(%cst) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Identity"(%arg0) {_embedding_pipelining = "forward", _replication_info = "repl_info"} : (tensor) -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } + // Only verify sc_backward. The previous test case verifies everything else. + // CHECK: func.func private @_func_sc_backward + // CHECK-NEXT: return +} + +// ----- +// A test verifying too many TPUReplicateMetadataOp ops. Same logic tests too many TPUCompilationResultOp ops. 
+module { + func.func @main(%arg0: tensor<*x!tf_type.resource>, %arg1: tensor<*x!tf_type.resource>, %arg2: tensor<*x!tf_type.resource>>) { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + // expected-error @+1 {{number of tf.TPUReplicateMetadata in loop body is not 1}} + func.func private @while_body(%arg0: tensor) -> (tensor) { + // metadata ops + %embedding_pass_trigger = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + return %arg0 : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- +// A test verifying the replication region of TPUReplicateMetadataOp ops. Same logic tests too many TPUCompilationResultOp ops. 
+module { + func.func @main(%arg0: tensor<*x!tf_type.resource>, %arg1: tensor<*x!tf_type.resource>, %arg2: tensor<*x!tf_type.resource>>) { + %cst = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + // metadata ops + %embedding_pass_trigger = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 1 : i64} : () -> () + // expected-error @+1 {{'tf.TPUCompilationResult' op is not part of the replication region "repl_info" vs "wrong_repl_info"}} + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "wrong_repl_info"} : () -> tensor + return %arg0 : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} + +// ----- +// A test verifying TPUReplicatedOutput in the input graph doesn't trigger +// any additional TPUReplicatedInput or TPUReplicatedOutput ops. 
+module { + func.func @main() { + %cst_1 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + %0:2 = "tf.While"(%cst_1, %cst_2) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor, tensor) -> (tensor, tensor) + return + } + func.func private @while_body(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + // CHECK: {{.*StatefulPartitionedCall.* f = @_func_non_tpu.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_sc_forward.*}} + // CHECK-NEXT: {{.*StatefulPartitionedCall.* f = @_func_core_tpu.*}} + // metadata ops + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %1 = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + %2 = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<3> : tensor} : () -> tensor + %3:2 = "tf.TPUReplicatedOutput"(%2) {device = ""} : (tensor) -> (tensor, tensor) + + // core_tpu ops: + %res_t = "tf.Const"() {_replication_info = "repl_info", value = dense<4> : tensor} : () -> tensor + + // non_tpu_ops + %res_n = "tf.Const"() {value = dense<5> : tensor} : () -> tensor + + return %res_n, %3#1 : tensor, tensor + } + func.func private @while_cond(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf.Less"(%arg1, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } + // CHECK-DAG: TPUReplicatedOutput + // CHECK-NOT: TPUReplicatedOutput + // CHECK-NOT: TPUReplicatedInput +} + +// ----- +// Verify error for backward pass with no forward pass.
+module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + // expected-error @+1 {{'tf.Identity' op embedding backwards pass op with no forwards pass ops}} + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// Verify error for unknown _embedding_pipelining attribute value. 
+module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + // expected-error @+1 {{'tf.Identity' op embedding op has unknown _embedding_pipelining attribute value garbage.}} + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "garbage", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// Verify error for multiple WhileOp use of while_body function. 
+module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + // expected-error @+1 {{'tf.While' op multiple users of function.}} + %1 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} + +// ----- +// Verify error for non-WhileOp use of while_body function. 
+module { + func.func @main() { + %cst_main = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %0 = "tf.While"(%cst_main) {body = @while_body, cond = @while_cond, is_stateless = false} : (tensor) -> (tensor) + // expected-error @+1 {{'tf.StatefulPartitionedCall' op non while use of function.}} + %38 = "tf.StatefulPartitionedCall"(%cst_main) {config = "", config_proto = "", executor_type = "", f = @while_body} : (tensor) -> tensor + return + } + func.func private @while_body(%arg0: tensor) -> (tensor) { + "tf.TPUReplicateMetadata"() {_has_manual_control_dependencies = true, _replication_info = "repl_info", num_replicas = 2 : i64} : () -> () + %comp_res = "tf.TPUCompilationResult"() {_tpu_compilation_status = "repl_info"} : () -> tensor + + // forward_ops + %res_f = "tf.Const"() {_embedding_pipelining = "forward", _replication_info = "repl_info", value = dense<2> : tensor} : () -> tensor + + // core_tpu ops: + %res_t = "tf.Identity"(%res_f) {_replication_info = "repl_info"} : (tensor) -> tensor + + // backward_ops + %res_b = "tf.Identity"(%res_t) {_embedding_pipelining = "backward", _replication_info = "repl_info"} : (tensor) -> tensor + + // non_tpu_ops + %res_n = "tf.Identity"(%arg0) : (tensor) -> tensor + + return %res_n : tensor + } + func.func private @while_cond(%arg0: tensor) -> tensor { + %0 = "tf.Less"(%arg0, %arg0) : (tensor, tensor) -> tensor + return %0 : tensor + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/BUILD index 954eca9c0e2..421bbd5de79 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_coarsening/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = 
"@llvm-project//mlir:run_lit.sh", test_file_exts = [ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/BUILD index 954eca9c0e2..421bbd5de79 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_island_inlining/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = [ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/BUILD index 954eca9c0e2..421bbd5de79 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/executor_tpuv1_outline_island/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = [ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir index f9a097d8fef..8657ed861c3 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/extract_outside_compilation.mlir @@ -881,7 +881,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" - // CHECK: "tf._XlaHostComputeMlir"(%6) + // CHECK: "tf._XlaHostComputeMlir"(%[[G_OUTPUT]]) // CHECK-SAME: key = 
"if_predicate_channel_1" // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) // CHECK: %[[HOST_COMPUTE_OUT:[0-9]*]] = "tf._XlaHostComputeMlir"(%[[B_OUTPUT]], %[[A_OUTPUT]]) @@ -932,7 +932,7 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor // CHECK: %[[A_OUTPUT:[0-9]*]] = "tf.A" // CHECK: %[[B_OUTPUT:[0-9]*]] = "tf.B" // CHECK: %[[G_OUTPUT:[0-9]*]] = "tf.G" - // CHECK: "tf._XlaHostComputeMlir"(%6) + // CHECK: "tf._XlaHostComputeMlir"(%[[G_OUTPUT]]) // CHECK-SAME: key = "if_predicate_channel_0" // CHECK-NEXT: tf.IfRegion"(%[[G_OUTPUT]]) // CHECK-NEXT: "tf.Yield"() : () -> () @@ -2098,3 +2098,203 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:loc func.return %0 : tensor<2xi32> } } + +// ----- + +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1443 : i32}} { + // Tests map_outside_compilation when there is no replication. 
+ // The sharding is: + // type: OTHER + // tile_assignment_dimensions: 2 + // tile_assignment_dimensions: 1 + // tile_assignment_devices: 0 + // tile_assignment_devices: 1 + // Serialized string: + // "\08\03\1A\02\02\01\22\02\00\01" + + // CHECK-LABEL: func @map_outside_compilation_not_replicated + func.func @map_outside_compilation_not_replicated() -> () { + // CHECK: "tf_device.parallel_execute" + // CHECK: "tf_device.launch" + // CHECK: %[[PROGRAM0:.+]] = "tf._TPUCompileMlirPlaceholderProgramKey" + // CHECK: %[[RECV0:.+]] = "tf._XlaRecvAtHost"(%[[PROGRAM0]]) + // CHECK-SAME: _xla_has_host_transfer = true + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_0_args" + // CHECK: %[[B0:.+]] = "tf.OpB"(%[[RECV0]]) : (tensor<2x2xi64>) -> tensor<2x2xi64> + // CHECK: "tf._XlaSendFromHost"(%[[B0]], %[[PROGRAM0]]) + // CHECK-SAME: _xla_has_host_transfer = true + // CHECK-SAME: device_ordinal = 0 + // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK: }, { + // CHECK: %[[PROGRAM1:.+]] = "tf._TPUCompileMlirPlaceholderProgramKey" + // CHECK: %[[RECV1:.+]] = "tf._XlaRecvAtHost"(%[[PROGRAM1]]) + // CHECK-SAME: _xla_has_host_transfer = true + // CHECK-SAME: device_ordinal = 1 + // CHECK-SAME: key = "host_compute_channel_0_args" + // CHECK: %[[B1:.+]] = "tf.OpB"(%[[RECV1]]) : (tensor<2x2xi64>) -> tensor<2x2xi64> + // CHECK: "tf._XlaSendFromHost"(%[[B1]], %[[PROGRAM1]]) + // CHECK-SAME: _xla_has_host_transfer = true + // CHECK-SAME: device_ordinal = 1 + // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK: }, { + // CHECK: "tf_device.cluster" + // CHECK: %[[A:.+]] = "tf.OpA" + // CHECK: %[[A_SHARD:.+]] = "tf.XlaSpmdFullToShardShape"(%[[A]]) {dim = -1 : i64, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xi64>) -> tensor<1x2xi64> + // CHECK: %[[B:.+]] = "tf._XlaHostComputeMlir"(%[[A_SHARD]]) + // CHECK-SAME: manual_sharding = true + // CHECK-SAME: recv_key = 
"host_compute_channel_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_0_args" + // CHECK: %[[B_FULL:.+]] = "tf.XlaSpmdShardToFullShape"(%[[B]]) {dim = -1 : i64, full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<1x2xi64>) -> tensor<2x2xi64> + // CHECK: "tf.OpC"(%[[B_FULL]]) + "tf_device.cluster"() ({ + %0 = "tf.OpA"() {_XlaSharding = "\08\03\1A\02\02\01\22\02\00\01"} : () -> tensor<2x2xi64> + %1 = "tf.OpB"(%0) {_xla_map_outside_compilation = "0", _xla_outside_compilation = "from_launch"} : (tensor<2x2xi64>) -> tensor<2x2xi64> + "tf.OpC"(%1) : (tensor<2x2xi64>) -> () + tf_device.return + }) {_xla_compile_device_type = "TPU", computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], host_compute_core = [], num_cores_per_replica = 2 : i64, padding_map = [], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01*\02\08\01", use_spmd_for_xla_partitioning = true, use_tpu = true} : () -> () + return + } +} + +// ----- + +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU:2", "/job:localhost/replica:0/task:0/device:TPU:3", "/job:localhost/replica:0/task:0/device:TPU:4", "/job:localhost/replica:0/task:0/device:TPU:5", "/job:localhost/replica:0/task:0/device:TPU:6", "/job:localhost/replica:0/task:0/device:TPU:7", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1458 : i32}} { + // Tests map_outside_compilation when there is replication. 
+ // The sharding is: + // type: OTHER + // tile_assignment_dimensions: 2 + // tile_assignment_dimensions: 1 + // tile_assignment_devices: 0 + // tile_assignment_devices: 1 + // Serialized string: + // "\08\03\1A\02\02\01\22\02\00\01" + + // CHECK-LABEL: func @map_outside_compilation_replicated + func.func @map_outside_compilation_replicated() -> () { + // CHECK: tf_device.replicate + // CHECK: "tf_device.parallel_execute" + // CHECK: %[[PROGRAM0:.+]] = "tf._TPUCompileMlirPlaceholderProgramKey" + // CHECK: %[[DEVICE0_0:.+]] = "tf._TPUDeviceOrdinalPlaceholder" + // CHECK: %[[RECV0:.+]] = "tf._XlaRecvAtHostV2"(%[[PROGRAM0]], %[[DEVICE0_0]]) + // CHECK-SAME: _xla_has_host_transfer = true + // CHECK-SAME: key = "host_compute_channel_0_args" + // CHECK: %[[B0:.+]] = "tf.OpB"(%[[RECV0]]) : (tensor<2x2xi64>) -> tensor<2x2xi64> + // CHECK: "tf._XlaSendFromHostV2"(%[[B0]], %[[PROGRAM0]], %[[DEVICE0_0]]) + // CHECK-SAME: _xla_has_host_transfer = true + // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK: }, { + // CHECK: %[[PROGRAM1:.+]] = "tf._TPUCompileMlirPlaceholderProgramKey" + // CHECK: %[[DEVICE1_0:.+]] = "tf._TPUDeviceOrdinalPlaceholder" + // CHECK: %[[ONE_0:.+]] = "tf.Const" + // CHECK-SAME: value = dense<1> + // CHECK: %[[DEVICE1_1:.+]] = "tf.AddV2"(%[[DEVICE1_0]], %[[ONE_0]]) + // CHECK: %[[RECV1:.+]] = "tf._XlaRecvAtHostV2"(%[[PROGRAM1]], %[[DEVICE1_1]]) + // CHECK-SAME: _xla_has_host_transfer = true + // CHECK-SAME: key = "host_compute_channel_0_args" + // CHECK: %[[B1:.+]] = "tf.OpB"(%[[RECV1]]) : (tensor<2x2xi64>) -> tensor<2x2xi64> + // CHECK: %[[ONE_1:.+]] = "tf.Const" + // CHECK-SAME: value = dense<1> + // CHECK: %[[DEVICE1_2:.+]] = "tf.AddV2"(%[[DEVICE1_0]], %[[ONE_1]]) + // CHECK: "tf._XlaSendFromHostV2"(%[[B1]], %[[PROGRAM1]], %[[DEVICE1_2]]) + // CHECK-SAME: _xla_has_host_transfer = true + // CHECK-SAME: key = "host_compute_channel_0_retvals" + // CHECK: }, { + // CHECK: "tf_device.cluster" + // CHECK: %[[A:.+]] = "tf.OpA" + // CHECK: 
%[[A_SHARD:.+]] = "tf.XlaSpmdFullToShardShape"(%[[A]]) {dim = -1 : i64, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xi64>) -> tensor<1x2xi64> + // CHECK: %[[B:.+]] = "tf._XlaHostComputeMlir"(%[[A_SHARD]]) + // CHECK-SAME: manual_sharding = true + // CHECK-SAME: recv_key = "host_compute_channel_0_retvals" + // CHECK-SAME: send_key = "host_compute_channel_0_args" + // CHECK: %[[B_FULL:.+]] = "tf.XlaSpmdShardToFullShape"(%[[B]]) {dim = -1 : i64, full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<1x2xi64>) -> tensor<2x2xi64> + // CHECK: "tf.OpC"(%[[B_FULL]]) + tf_device.replicate() {n = 4 : i32} { + "tf_device.cluster"() ({ + %0 = "tf.OpA"() {_XlaSharding = "\08\03\1A\02\02\01\22\02\00\01"} : () -> tensor<2x2xi64> + %1 = "tf.OpB"(%0) {_xla_map_outside_compilation = "0", _xla_outside_compilation = "from_launch"} : (tensor<2x2xi64>) -> tensor<2x2xi64> + "tf.OpC"(%1) : (tensor<2x2xi64>) -> () + tf_device.return + }) {_xla_compile_device_type = "TPU", computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1], host_compute_core = [], num_cores_per_replica = 2 : i64, padding_map = [], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01*\02\08\01", use_spmd_for_xla_partitioning = true, use_tpu = true} : () -> () + tf_device.return + } + return + } +} + +// ----- + +module attributes {tf.devices = {"/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0"}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1443 : i32}} { + + // Test that map_outside_compilation's inputs are not unranked. 
+ func.func @map_outside_compilation_must_be_ranked() -> () { + "tf_device.cluster"() ({ + %0 = "tf.OpA"() : () -> tensor<*xi64> + // expected-error @+1 {{must be ranked}} + %1 = "tf.OpB"(%0) {_xla_map_outside_compilation = "0", _xla_outside_compilation = "from_launch"} : (tensor<*xi64>) -> tensor<*xi64> + tf_device.return + }) {_xla_compile_device_type = "TPU", computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], host_compute_core = [], num_cores_per_replica = 2 : i64, padding_map = [], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01*\02\08\01", use_spmd_for_xla_partitioning = true, use_tpu = true} : () -> () + return + } + + // Test that map_outside_compilation's inputs have rank >= 1. + func.func @map_outside_compilation_must_have_rank_gte_1() -> () { + "tf_device.cluster"() ({ + %0 = "tf.OpA"() : () -> tensor + // expected-error @+1 {{must have rank at least one}} + %1 = "tf.OpB"(%0) {_xla_map_outside_compilation = "0", _xla_outside_compilation = "from_launch"} : (tensor) -> tensor + tf_device.return + }) {_xla_compile_device_type = "TPU", computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], host_compute_core = [], num_cores_per_replica = 2 : i64, padding_map = [], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01*\02\08\01", use_spmd_for_xla_partitioning = true, use_tpu = true} : () -> () + return + } + + // Test that map_outside_compilation's inputs shapes are divisible by num_cores_per_replica. 
+ func.func @map_outside_compilation_div_num_cores_per_replica() -> () { + "tf_device.cluster"() ({ + %0 = "tf.OpA"() : () -> tensor<3xi64> + // expected-error @+1 {{divisible by num_cores_per_replica}} + %1 = "tf.OpB"(%0) {_xla_map_outside_compilation = "0", _xla_outside_compilation = "from_launch"} : (tensor<3xi64>) -> tensor<3xi64> + tf_device.return + }) {_xla_compile_device_type = "TPU", computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], host_compute_core = [], num_cores_per_replica = 2 : i64, padding_map = [], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01*\02\08\01", use_spmd_for_xla_partitioning = true, use_tpu = true} : () -> () + return + } + + // Test that map_outside_compilation's preceeding ops have an _XlaSharding attribute. + func.func @map_outside_compilation_explicit_sharding() -> () { + "tf_device.cluster"() ({ + %0 = "tf.OpA"() : () -> tensor<2xi64> + // expected-error @+1 {{should have an explicit sharding}} + %1 = "tf.OpB"(%0) {_xla_map_outside_compilation = "0", _xla_outside_compilation = "from_launch"} : (tensor<2xi64>) -> tensor<2xi64> + tf_device.return + }) {_xla_compile_device_type = "TPU", computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], host_compute_core = [], num_cores_per_replica = 2 : i64, padding_map = [], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01*\02\08\01", use_spmd_for_xla_partitioning = true, use_tpu = true} : () -> () + return + } + + // Test that map_outside_compilation has at least 1 input to the + // _XlaHostComputeMlir op. In this case, %arg0 is not input to the + // generated _XlaHostComputeMlir. 
+ func.func @map_outside_compilation_preceeding_op(%arg0 : tensor<2xi64>) -> () { + "tf_device.cluster"() ({ + // expected-error @+1 {{should have at least one input}} + %1 = "tf.OpB"(%arg0) {_xla_map_outside_compilation = "0", _xla_outside_compilation = "from_launch"} : (tensor<2xi64>) -> tensor<2xi64> + tf_device.return + }) {_xla_compile_device_type = "TPU", computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1], host_compute_core = [], num_cores_per_replica = 2 : i64, padding_map = [], topology = "\0A\04\01\01\01\02\10\01\18\02\22\08\00\00\00\00\00\00\00\01*\02\08\01", use_spmd_for_xla_partitioning = true, use_tpu = true} : () -> () + return + } + + // Test that map_outside_compilation inputs have the same sharding. + func.func @map_outside_compilation_same_sharding() -> () { + tf_device.replicate() {n = 4 : i32} { + "tf_device.cluster"() ({ + %0 = "tf.OpA"() {_XlaSharding = "\08\03\1A\02\02\01\22\02\00\01"} : () -> tensor<2x2xi64> + %1 = "tf.OpB"() {_XlaSharding = "\08\03\1A\02\02\01\22\02\00\02"} : () -> tensor<2x2xi64> + // expected-error @+1 {{should have the same sharding}} + %2 = "tf.OpC"(%0, %1) {_xla_map_outside_compilation = "0", _xla_outside_compilation = "from_launch"} : (tensor<2x2xi64>, tensor<2x2xi64>) -> tensor<2x2xi64> + "tf.OpD"(%2) : (tensor<2x2xi64>) -> () + tf_device.return + }) {_xla_compile_device_type = "TPU", computation_shape = [], device = "", device_assignment = [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1], host_compute_core = [], num_cores_per_replica = 2 : i64, padding_map = [], topology = "\0A\04\02\02\01\02\10\01\18\08\22 \00\00\00\00\00\00\00\01\01\00\00\00\01\00\00\01\00\01\00\00\00\01\00\01\01\01\00\00\01\01\00\01*\02\08\01", use_spmd_for_xla_partitioning = true, use_tpu = true} : () -> () + tf_device.return + } + return + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD 
b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD index 71493d0f30a..186794e8891 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [ ":debug_info_files", ":test_utilities", diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-retval-attrs.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-retval-attrs.pbtxt index 2d2ad5b5083..515d74231df 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-retval-attrs.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/arg-retval-attrs.pbtxt @@ -152,4 +152,4 @@ versions { # CHECK: func @main # CHECK-SAME: ({{%.*}}: tensor<*xf32>, {{%.*}}: tensor<*xi32> {tf._arg1_attr0 = "_arg1_attr0_value", tf._arg1_attr1 = 8.000000e+00 : f32}, {{%.*}}: tensor<*xi1>) # CHECK-SAME: -> (tensor<*xf32> {tf._ret0_attr0 = 8 : i64, tf._ret0_attr1 = false}, tensor<*xi32>, tensor<*xi1> {tf._ret2_attr0 = !tf_type.variant, tf._ret2_attr1 = #tf_type.shape<128x1024>}) -# CHECK-SAME: attributes {tf.entry_function = {control_outputs = "", inputs = "arg0,arg1,arg2", outputs = "ret0,ret1,ret2"}} +# CHECK-SAME: attributes {allow_soft_placement = false, tf.entry_function = {control_outputs = "", inputs = "arg0,arg1,arg2", outputs = "ret0,ret1,ret2"}} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/batch_use_same_function/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/batch_use_same_function/BUILD index ab1cc6459a1..b770ab3bf89 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/batch_use_same_function/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/batch_use_same_function/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") 
licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [ ":debug_info_files", ":test_utilities", diff --git a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt index d4d5b8e3c52..eef2fbb92b2 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt +++ b/tensorflow/compiler/mlir/tensorflow/tests/graphdef2mlir/graph-as-function.pbtxt @@ -1,4 +1,4 @@ -# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -tf-xla-compile-device-type="GPU" -o - | FileCheck %s +# RUN: tf-mlir-translate -graphdef-to-mlir -tf-enable-shape-inference-on-import=false %s -tf-graph-as-function -tf-xla-compile-device-type="GPU" -tf-enable-soft-placement-on-import=true -o - | FileCheck %s # Verify main graph was converted to a function, args/rets are mapped correctly, # and ops in the main graph are retained. 
In addition, check if subsequent @@ -6,6 +6,7 @@ # CHECK: func @main(%arg0: tensor<*x!tf_type.resource>, %arg1: tensor<*x!tf_type.resource>>, %arg2: tensor<*xf32>, %arg3: tensor<2x4x6x8xi32>) -> (tensor<*xf32>, tensor<*xf32>) # CHECK-SAME: _xla_compile_device_type = "GPU" +# CHECK-SAME: allow_soft_placement # CHECK-SAME: control_outputs = "" # CHECK-SAME: inputs = "args_0,args_1,args_2,args_3" # CHECK-SAME: outputs = "rets_0,rets_1" diff --git a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir index 56b9adab296..68440b125d7 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/legalize_hlo.mlir @@ -1,5 +1,5 @@ // NOTE: Assertions have been autogenerated by utils/generate-test-checks.py -// RUN: tf-opt -tf-legalize-hlo %s | FileCheck %s +// RUN: tf-opt -tf-legalize-hlo %s -verify-diagnostics -split-input-file | FileCheck %s // CHECK-LABEL: func @biasAdd_NHWC( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x32x10x32xi32>, @@ -1476,7 +1476,16 @@ func.func @convert_slice(%arg0: tensor<1x4672xf32>) -> tensor<1x519xf32> { func.func @reshape(%arg0: tensor<4x6xf32>) -> tensor<2x2x6xf32> { %0 = "mhlo.reshape"(%arg0) : (tensor<4x6xf32>) -> tensor<2x2x6xf32> func.return %0 : tensor<2x2x6xf32> +} +// CHECK-LABEL: func @round_nearest_even( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<2xf32>) -> tensor<2xf32> { +// CHECK: %[[VAL_1:.*]] = "tf.Round"(%[[VAL_0]]) : (tensor<2xf32>) -> tensor<2xf32> +// CHECK: return %[[VAL_1]] : tensor<2xf32> +// CHECK: } +func.func @round_nearest_even(%arg0: tensor<2xf32>) -> tensor<2xf32> { + %0 = "mhlo.round_nearest_even"(%arg0) : (tensor<2xf32>) -> tensor<2xf32> + return %0 : tensor<2xf32> } // CHECK-LABEL: func @convert_dot_2d_1d( @@ -1733,13 +1742,25 @@ func.func @no_convert_conv1d_feature_group_gt_1(%arg0: tensor<16x32x256xbf16>, % func.return %0 : tensor<16x32x128xbf16> } -// CHECK-LABEL: func.func 
@no_convert_conv1d_missing_windows_strides( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<16x32x256xbf16>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> { -// CHECK: %[[VAL_2:.*]] = mhlo.convolution(%[[VAL_0]], %[[VAL_1]]) dim_numbers = [b, 0, f]x[0, i, o]->[b, 0, f], window = {pad = {{\[\[}}0, 0]], lhs_dilate = [1], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<16x32x256xbf16>, tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> -// CHECK: return %[[VAL_2]] : tensor<16x32x256xbf16> +// CHECK-LABEL: func.func @convert_conv1d_missing_windows_strides_fallback( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<16x32x256xbf16>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> { +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant dense<[16, 32, 256, 1]> : tensor<4xi64> +// CHECK: %[[VAL_3:.*]] = "tf.Reshape"(%[[VAL_0]], %[[VAL_2]]) : (tensor<16x32x256xbf16>, tensor<4xi64>) -> tensor<16x32x256x1xbf16> +// CHECK-DAG: %[[VAL_4:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_5:.*]] = "tf.Transpose"(%[[VAL_3]], %[[VAL_4]]) : (tensor<16x32x256x1xbf16>, tensor<4xi64>) -> tensor<16x32x1x256xbf16> +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant dense<[1, 256, 256, 1]> : tensor<4xi64> +// CHECK: %[[VAL_7:.*]] = "tf.Reshape"(%[[VAL_1]], %[[VAL_6]]) : (tensor<1x256x256xbf16>, tensor<4xi64>) -> tensor<1x256x256x1xbf16> +// CHECK-DAG: %[[VAL_8:.*]] = "tf.Const"() {value = dense<[0, 3, 1, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_9:.*]] = "tf.Transpose"(%[[VAL_7]], %[[VAL_8]]) : (tensor<1x256x256x1xbf16>, tensor<4xi64>) -> tensor<1x1x256x256xbf16> +// CHECK: %[[VAL_10:.*]] = "tf.Conv2D"(%[[VAL_5]], %[[VAL_9]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<16x32x1x256xbf16>, 
tensor<1x1x256x256xbf16>) -> tensor<16x32x1x256xbf16> +// CHECK: %[[VAL_11:.*]] = "tf.Const"() {value = dense<[0, 1, 3, 2]> : tensor<4xi64>} : () -> tensor<4xi64> +// CHECK: %[[VAL_12:.*]] = "tf.Transpose"(%[[VAL_10]], %[[VAL_11]]) : (tensor<16x32x1x256xbf16>, tensor<4xi64>) -> tensor<16x32x256x1xbf16> +// CHECK: %[[VAL_13:.*]] = arith.constant dense<[16, 32, 256]> : tensor<3xi64> +// CHECK: %[[VAL_14:.*]] = "tf.Reshape"(%[[VAL_12]], %[[VAL_13]]) : (tensor<16x32x256x1xbf16>, tensor<3xi64>) -> tensor<16x32x256xbf16> +// CHECK: return %[[VAL_14]] : tensor<16x32x256xbf16> // CHECK: } -func.func @no_convert_conv1d_missing_windows_strides(%arg0: tensor<16x32x256xbf16>, %arg1: tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> { +func.func @convert_conv1d_missing_windows_strides_fallback(%arg0: tensor<16x32x256xbf16>, %arg1: tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> { %0 = "mhlo.convolution"(%arg0, %arg1) { batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<[b, 0, f]x[0, i, o]->[b, 0, f]>, @@ -1752,6 +1773,25 @@ func.func @no_convert_conv1d_missing_windows_strides(%arg0: tensor<16x32x256xbf1 func.return %0 : tensor<16x32x256xbf16> } +// CHECK-LABEL: func.func @convert_conv1d_missing_windows_strides_fallback_2( +// CHECK-SAME: %[[VAL_0:.*]]: tensor<1x64x64x4xbf16>, +// CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x4x320xbf16>) -> tensor<1x62x62x320xbf16> { +// CHECK: %[[VAL_2:.*]] = "tf.Conv2D"(%[[VAL_0]], %[[VAL_1]]) {data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true} : (tensor<1x64x64x4xbf16>, tensor<3x3x4x320xbf16>) -> tensor<1x62x62x320xbf16> +// CHECK: return %[[VAL_2]] : tensor<1x62x62x320xbf16> +// CHECK: } +func.func @convert_conv1d_missing_windows_strides_fallback_2(%arg0: tensor<1x64x64x4xbf16>, %arg1: tensor<3x3x4x320xbf16>) -> tensor<1x62x62x320xbf16> { + %0 = "mhlo.convolution"(%arg0, %arg1) { + batch_group_count = 1 : i64, + dimension_numbers = #mhlo.conv<[b, 0, 
1, f]x[0, 1, i, o]->[b, 0, 1, f]>, + feature_group_count = 1 : i64, + lhs_dilation = dense<[1, 1]> : tensor<2xi64>, + padding = dense<0> : tensor<2x2xi64>, + precision_config = [#mhlo, #mhlo], + rhs_dilation = dense<[1, 1]> : tensor<2xi64> + } : (tensor<1x64x64x4xbf16>, tensor<3x3x4x320xbf16>) -> tensor<1x62x62x320xbf16> + func.return %0 : tensor<1x62x62x320xbf16> +} + // CHECK-LABEL: func @convert_conv2d( // CHECK-SAME: %[[VAL_0:.*]]: tensor<1x8x8x207xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<3x3x207x16xf32>) -> tensor<1x8x8x16xf32> { @@ -2625,6 +2665,89 @@ func.func @convert_gather_offset(%arg0: tensor<1x20xi32>, %arg1: tensor<1x1xi32> func.return %0 : tensor<1x1xi32> } +// CHECK-LABEL: func @convert_gather_to_slice_batch_size_1( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x2944xi32>, +// CHECK-SAME: %[[ARG_1:.*]]: tensor<1x2xi32>) +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[0, 1440]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_0:.*]] = "tf.Maximum"(%[[ARG_1]], %[[CST_0:.*]]) : (tensor<1x2xi32>, tensor<2xi32>) -> tensor<1x2xi32> +// CHECK: %[[VAL_1:.*]] = "tf.Minimum"(%[[VAL_0]], %[[CST]]) : (tensor<1x2xi32>, tensor<2xi32>) -> tensor<1x2xi32> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<[1, 1504]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_2:.*]] = "tf.Squeeze"(%[[VAL_1]]) {squeeze_dims = [0]} : (tensor<1x2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_3:.*]] = "tf.Slice"(%[[ARG_0]], %[[VAL_2]], %[[CST_1]]) : (tensor<1x2944xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x1504xi32> +// CHECK: return %[[VAL_3]] +// CHECK: } +func.func @convert_gather_to_slice_batch_size_1(%arg0: tensor<1x2944xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x1504xi32> { + %0 = "mhlo.gather"(%arg0, %arg1) { + dimension_numbers = #mhlo.gather< + offset_dims = [1], + collapsed_slice_dims = [0], + start_index_map = [0, 1], + index_vector_dim = 1, + 
>, + indices_are_sorted = true, + slice_sizes = dense<[1, 1504]> : tensor<2xi64> + } : (tensor<1x2944xi32>, tensor<1x2xi32>) -> tensor<1x1504xi32> + func.return %0 : tensor<1x1504xi32> +} + +// CHECK-LABEL: func @convert_gather_to_slice( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<3x2944xi32>, +// CHECK-SAME: %[[ARG_1:.*]]: tensor<3x2xi32>) +// CHECK-DAG: %[[CST:.*]] = "tf.Const"() {value = dense<[2, 1440]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_0:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_0:.*]] = "tf.Maximum"(%[[ARG_1]], %[[CST_0]]) : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<3x2xi32> +// CHECK: %[[VAL_1:.*]] = "tf.Minimum"(%[[VAL_0]], %[[CST]]) : (tensor<3x2xi32>, tensor<2xi32>) -> tensor<3x2xi32> +// CHECK-DAG: %[[CST_1:.*]] = "tf.Const"() {value = dense<[1, 1504]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_2:.*]] = "tf.Const"() {value = dense<0> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_3:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_2:.*]] = "tf.Slice"(%[[VAL_1]], %[[CST_2]], %[[CST_3]]) : (tensor<3x2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x2xi32> +// CHECK: %[[VAL_3:.*]] = "tf.Squeeze"(%[[VAL_2]]) {squeeze_dims = [0]} : (tensor<1x2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_4:.*]] = "tf.Slice"(%[[ARG_0]], %[[VAL_3]], %[[CST_1]]) : (tensor<3x2944xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x1504xi32> +// CHECK-DAG: %[[CST_4:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_5:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_5:.*]] = "tf.Slice"(%[[VAL_1]], %[[CST_4]], %[[CST_5]]) : (tensor<3x2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x2xi32> +// CHECK: %[[VAL_6:.*]] = "tf.Squeeze"(%[[VAL_5]]) {squeeze_dims = [0]} : (tensor<1x2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_7:.*]] = 
"tf.Slice"(%[[ARG_0]], %[[VAL_6]], %[[CST_1]]) : (tensor<3x2944xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x1504xi32> +// CHECK-DAG: %[[CST_6:.*]] = "tf.Const"() {value = dense<[2, 0]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK-DAG: %[[CST_7:.*]] = "tf.Const"() {value = dense<[1, 2]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_8:.*]] = "tf.Slice"(%[[VAL_1]], %[[CST_6]], %[[CST_7]]) : (tensor<3x2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x2xi32> +// CHECK: %[[VAL_9:.*]] = "tf.Squeeze"(%[[VAL_8]]) {squeeze_dims = [0]} : (tensor<1x2xi32>) -> tensor<2xi32> +// CHECK: %[[VAL_10:.*]] = "tf.Slice"(%[[ARG_0]], %[[VAL_9]], %[[CST_1]]) : (tensor<3x2944xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<1x1504xi32> +// CHECK-DAG: %[[CST_8:.*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor +// CHECK: %[[VAL_11:.*]] = "tf.ConcatV2"(%[[VAL_4]], %[[VAL_7]], %[[VAL_10]], %[[CST_8]]) : (tensor<1x1504xi32>, tensor<1x1504xi32>, tensor<1x1504xi32>, tensor) -> tensor<3x1504xi32> +// CHECK: return %[[VAL_11]] +// CHECK: } +func.func @convert_gather_to_slice(%arg0: tensor<3x2944xi32>, %arg1: tensor<3x2xi32>) -> tensor<3x1504xi32> { + %0 = "mhlo.gather"(%arg0, %arg1) { + dimension_numbers = #mhlo.gather< + offset_dims = [1], + collapsed_slice_dims = [0], + start_index_map = [0, 1], + index_vector_dim = 1, + >, + indices_are_sorted = true, + slice_sizes = dense<[1, 1504]> : tensor<2xi64> + } : (tensor<3x2944xi32>, tensor<3x2xi32>) -> tensor<3x1504xi32> + func.return %0 : tensor<3x1504xi32> +} + +// CHECK-LABEL: func @convert_gather_to_slice_dynamic_error +func.func @convert_gather_to_slice_dynamic_error(%arg0: tensor<3x?xi32>, %arg1: tensor<3x2xi32>) -> tensor<3x1504xi32> { + // expected-error @+1 {{Dynamic shaped inputs are not supported.}} + %0 = "mhlo.gather"(%arg0, %arg1) { + dimension_numbers = #mhlo.gather< + offset_dims = [1], + collapsed_slice_dims = [0], + start_index_map = [0, 1], + index_vector_dim = 1, + >, + indices_are_sorted = true, + 
slice_sizes = dense<[1, 1504]> : tensor<2xi64> + } : (tensor<3x?xi32>, tensor<3x2xi32>) -> tensor<3x1504xi32> + func.return %0 : tensor<3x1504xi32> +} + // CHECK-LABEL: func @convert_dynamic_slice( // CHECK-SAME: %[[VAL_0:.*]]: tensor<7x3xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor, @@ -2755,6 +2878,39 @@ func.func @convert_scatter_update_to_non_trailing_operand_dimensions( func.return %0 : tensor<5x4x3x7xf32> } +// CHECK-LABEL: func @convert_scatter_update_reshape_indices_and_updates( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<16x1504xf32>, +// CHECK-SAME: %[[ARG_1:.*]]: tensor<1xi32>, +// CHECK-SAME: %[[ARG_2:.*]]: tensor<16xf32>) -> tensor<16x1504xf32> { +// CHECK: %[[CST:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_0:.*]] = "tf.Transpose"(%[[ARG_0]], %[[CST]]) : (tensor<16x1504xf32>, tensor<2xi64>) -> tensor<1504x16xf32> +// CHECK: %[[CST_0:.*]] = "tf.Const"() {value = dense<1> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_1:.*]] = "tf.Reshape"(%[[ARG_1]], %[[CST_0]]) : (tensor<1xi32>, tensor<2xi32>) -> tensor<1x1xi32> +// CHECK: %[[CST_1:.*]] = "tf.Const"() {value = dense<[1, 16]> : tensor<2xi32>} : () -> tensor<2xi32> +// CHECK: %[[VAL_2:.*]] = "tf.Reshape"(%[[ARG_2]], %[[CST_1]]) : (tensor<16xf32>, tensor<2xi32>) -> tensor<1x16xf32> +// CHECK: %[[VAL_3:.*]] = "tf.TensorScatterUpdate"(%[[VAL_0]], %[[VAL_1]], %[[VAL_2]]) : (tensor<1504x16xf32>, tensor<1x1xi32>, tensor<1x16xf32>) -> tensor<1504x16xf32> +// CHECK: %[[CST_2:.*]] = "tf.Const"() {value = dense<[1, 0]> : tensor<2xi64>} : () -> tensor<2xi64> +// CHECK: %[[VAL_4:.*]] = "tf.Transpose"(%[[VAL_3]], %[[CST_2]]) : (tensor<1504x16xf32>, tensor<2xi64>) -> tensor<16x1504xf32> +// CHECK: return %[[VAL_4]] +// CHECK: } +func.func @convert_scatter_update_reshape_indices_and_updates( + %arg0: tensor<16x1504xf32>, + %arg1: tensor<1xi32>, + %arg2: tensor<16xf32>) -> tensor<16x1504xf32> +{ + %0 = "mhlo.scatter"(%arg0, %arg1, %arg2) ({ +^bb0(%arg3: tensor, 
%arg4: tensor): + "mhlo.return"(%arg4) : (tensor) -> () +}) { + indices_are_sorted = true, + scatter_dimension_numbers = #mhlo.scatter< + update_window_dims = [0], + inserted_window_dims = [1], + scatter_dims_to_operand_dims = [1]>, + unique_indices = true} : (tensor<16x1504xf32>, tensor<1xi32>, tensor<16xf32>) -> tensor<16x1504xf32> + func.return %0 : tensor<16x1504xf32> +} + // CHECK-LABEL: func @convert_scatter_add( // CHECK-SAME: %[[VAL_0:.*]]: tensor<20x6xf32>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<4x1xi32>, @@ -3264,30 +3420,9 @@ func.func @if(%arg0: tensor) -> (tensor) { // CHECK-SAME: %[[VAL_2:[a-z0-9]*]]: tensor, // CHECK-SAME: %[[VAL_3:[a-z0-9]*]]: tensor, // CHECK-SAME: %[[VAL_4:[a-z0-9]*]]: tensor) -> tensor<28x1x100xf32> { -// CHECK-DAG: %[[CST_0:[_a-z0-9]*]] = "tf.Const"() {value = dense<0> : tensor} : () -> tensor -// CHECK-DAG: %[[CST_1:[_a-z0-9]*]] = "tf.Const"() {value = dense<1> : tensor} : () -> tensor -// CHECK: %[[START_IND:[_a-z0-9]*]] = "tf.Pack"(%[[VAL_2]], %[[VAL_3]], %[[VAL_4]]) {axis = 0 : i64} : (tensor, tensor, tensor) -> tensor<3xi32> -// CHECK-DAG: %[[OP_SHAPE:[_a-z0-9]*]] = "tf.Const"() {value = dense<[28, 1, 100]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK-DAG: %[[UP_SHAPE:[_a-z0-9]*]] = "tf.Const"() {value = dense<[1, 1, 100]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: %[[MAX_START:[_a-z0-9]*]] = "tf.Sub"(%[[OP_SHAPE]], %[[UP_SHAPE]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> -// CHECK: %[[START_1:[_a-z0-9]*]] = "tf.Minimum"(%[[START_IND]], %[[MAX_START]]) : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi32> -// CHECK: %[[CLAMP_START:[_a-z0-9]*]] = "tf.Maximum"(%[[START_1]], %[[CST_0]]) : (tensor<3xi32>, tensor) -> tensor<3xi32> -// CHECK: %[[N_OP:[_a-z0-9]*]] = "tf.Const"() {value = dense<2800> : tensor} : () -> tensor -// CHECK: %[[FLAT_RANGE:[_a-z0-9]*]] = "tf.Range"(%[[CST_0]], %[[N_OP]], %[[CST_1]]) : (tensor, tensor, tensor) -> tensor<2800xi32> -// CHECK: %[[OP_SHAPE_1:[_a-z0-9]*]] = "tf.Const"() {value = 
dense<[28, 1, 100]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: %[[RANGE:[_a-z0-9]*]] = "tf.Reshape"(%[[FLAT_RANGE]], %[[OP_SHAPE_1]]) : (tensor<2800xi32>, tensor<3xi32>) -> tensor<28x1x100xi32> -// CHECK: %[[UP_SHAPE_1:[_a-z0-9]*]] = "tf.Const"() {value = dense<[1, 1, 100]> : tensor<3xi32>} : () -> tensor<3xi32> -// CHECK: %[[UPDATE_IDX:[_a-z0-9]*]] = "tf.Slice"(%[[RANGE]], %[[CLAMP_START]], %[[UP_SHAPE_1]]) : (tensor<28x1x100xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<1x1x100xi32> -// CHECK: %[[FLAT_UP_SHAPE:[_a-z0-9]*]] = "tf.Const"() {value = dense<[100, 1]> : tensor<2xi32>} : () -> tensor<2xi32> -// CHECK: %[[FLAT_UP_IDX:[_a-z0-9]*]] = "tf.Reshape"(%[[UPDATE_IDX]], %[[FLAT_UP_SHAPE]]) : (tensor<1x1x100xi32>, tensor<2xi32>) -> tensor<100x1xi32> -// CHECK: %[[FLAT_OP_SHAPE:[_a-z0-9]*]] = "tf.Const"() {value = dense<2800> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK: %[[FLAT_OP:[_a-z0-9]*]] = "tf.Reshape"(%[[VAL_0]], %[[FLAT_OP_SHAPE]]) : (tensor<28x1x100xf32>, tensor<1xi32>) -> tensor<2800xf32> -// CHECK: %[[FLAT_UP_SHAPE_1:[_a-z0-9]*]] = "tf.Const"() {value = dense<100> : tensor<1xi32>} : () -> tensor<1xi32> -// CHECK: %[[FLAT_UP:[_a-z0-9]*]] = "tf.Reshape"(%[[VAL_1]], %[[FLAT_UP_SHAPE_1]]) : (tensor<1x1x100xf32>, tensor<1xi32>) -> tensor<100xf32> -// CHECK: %[[FLAT_RESULT:[_a-z0-9]*]] = "tf.TensorScatterUpdate"(%[[FLAT_OP]], %[[FLAT_UP_IDX]], %[[FLAT_UP]]) : (tensor<2800xf32>, tensor<100x1xi32>, tensor<100xf32>) -> tensor<2800xf32> -// CHECK: %[[RESULT:[_a-z0-9]*]] = "tf.Reshape"(%[[FLAT_RESULT]], %[[OP_SHAPE]]) : (tensor<2800xf32>, tensor<3xi32>) -> tensor<28x1x100xf32> -// CHECK: return %[[RESULT]] : tensor<28x1x100xf32> -// CHECK: } +// CHECK: %0 = "tf.Pack"(%arg2, %arg3, %arg4) {axis = 0 : i64} : (tensor, tensor, tensor) -> tensor<3xi32> +// CHECK: %1 = "tf.XlaDynamicUpdateSlice"(%arg0, %arg1, %0) : (tensor<28x1x100xf32>, tensor<1x1x100xf32>, tensor<3xi32>) -> tensor<28x1x100xf32> +// CHECK: return %1 : tensor<28x1x100xf32> func.func 
@convert_dynamic_update_slice(%arg0: tensor<28x1x100xf32>, %arg1: tensor<1x1x100xf32>, %arg2: tensor, %arg3: tensor, %arg4: tensor) -> tensor<28x1x100xf32> { %0 = "mhlo.dynamic_update_slice"(%arg0, %arg1, %arg2, %arg3, %arg4) : (tensor<28x1x100xf32>, tensor<1x1x100xf32>, tensor, tensor, tensor) -> tensor<28x1x100xf32> func.return %0 : tensor<28x1x100xf32> @@ -3577,7 +3712,7 @@ func.func @reduce_window_trivial_window_dims(%arg0: tensor<4x12xf32>) -> tensor< // expected-error @+1 {{no reduced dimension is found.}} %1 = "mhlo.reduce_window"(%arg0, %0) ({ ^bb0(%arg1: tensor, %arg2: tensor): - %2 = mhlo.add %arg1, %arg2 : tensor + %2 = mhlo.multiply %arg1, %arg2 : tensor "mhlo.return"(%2) : (tensor) -> () }) {padding = dense<0> : tensor<2x2xi64>, window_dimensions = dense<1> : tensor<2xi64>} : (tensor<4x12xf32>, tensor) -> tensor<4x12xf32> func.return %1 : tensor<4x12xf32> @@ -3596,3 +3731,30 @@ func.func @convert_dot_quant_type(%arg0: tensor<1x256xf32>, %arg1: tensor<256x!q %0 = "mhlo.dot"(%arg0, %arg1) {precision_config = [#mhlo, #mhlo]} : (tensor<1x256xf32>, tensor<256x!quant.uniform>) -> tensor<1xf32> func.return %0 : tensor<1xf32> } + +// CHECK-LABEL: func @convert_approx_top_k_custom_call( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x4xf32>, +// CHECK-SAME: %[[ARG_1:.*]]: tensor<1x4xi32>, +// CHECK-SAME: %[[ARG_2:.*]]: tensor, +// CHECK-SAME: %[[ARG_3:.*]]: tensor) -> (tensor<1x4xf32>, tensor<1x4xi32>) { +// CHECK: %[[VALUES:.*]], %[[INDICES:.*]] = "tf.ApproxTopK"(%[[ARG_0]]) {aggregate_to_topk = true, is_max_k = true, k = 4 : i64, recall_target = 8.500000e-01 : f32, reduction_dimension = 1 : i64, reduction_input_size_override = -1 : i64} : (tensor<1x4xf32>) -> (tensor<1x4xf32>, tensor<1x4xi32>) +// CHECK: return %[[VALUES]], %[[INDICES]] : tensor<1x4xf32>, tensor<1x4xi32> +// CHECK: } +func.func @convert_approx_top_k_custom_call(%arg0: tensor<1x4xf32>, %arg1: tensor<1x4xi32>, %arg2: tensor, %arg3: tensor) -> (tensor<1x4xf32>, tensor<1x4xi32>) { + %0:2 = 
mhlo.custom_call @ApproxTopK(%arg0, %arg1, %arg2, %arg3) { + api_version = 4 : i32, + called_computations = [@top_k_gt_f32_comparator], + backend_config = { + aggregate_to_topk = true, + is_fallback = true, + recall_target = 8.500000e-01 : f32, + reduction_dim = 1 : i64, + reduction_input_size_override = -1 : i64, + top_k = 4 : i64} + } : (tensor<1x4xf32>, tensor<1x4xi32>, tensor, tensor) -> (tensor<1x4xf32>, tensor<1x4xi32>) + func.return %0#0, %0#1 : tensor<1x4xf32>, tensor<1x4xi32> +} +func.func @top_k_gt_f32_comparator(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor) -> tensor { + %0 = mhlo.compare GT, %arg0, %arg1 : (tensor, tensor) -> tensor + return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/BUILD index 432d0ab8733..4c9bd8a03e6 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = ["mlir"], diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-order.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-order.mlir index 019197a0a6c..9e2a83f4e06 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-order.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/function-order.mlir @@ -4,10 +4,13 @@ func.func @main() { tf_executor.graph { // CHECK: node { - // CHECK-NEXT: name: "tf.foo" - // CHECK-NEXT: op: "foo" + // CHECK-NEXT: name: "tf.PartitionedCall" + // CHECK-NEXT: op: "PartitionedCall" + // CHECK: func { + // CHECK: name: "foo" + // CHECK: } // CHECK: } - %0:2 = tf_executor.island wraps "tf.foo"() {name = "tf.foo"} : () -> tensor<*xf32> + %0 = 
tf_executor.island wraps "tf.PartitionedCall"() {Tin = [], Tout = [], config = "", config_proto = "", device = "", executor_type = "", f = @foo, name = "Call_foo"} : () -> () tf_executor.fetch } func.return @@ -65,14 +68,17 @@ func.func @bar() { // CHECK-NEXT: name: "foo" // CHECK-NEXT: } // CHECK-NEXT: node_def { -// CHECK-NEXT: name: "tf.bar" -// CHECK-NEXT: op: "bar" +// CHECK-NEXT: name: "tf.PartitionedCall" +// CHECK-NEXT: op: "PartitionedCall" +// CHECK: func { +// CHECK: name: "bar" +// CHECK: } // CHECK: } // CHECK-NEXT: } // CHECK: } func.func @foo() { tf_executor.graph { - %0:2 = tf_executor.island wraps "tf.bar"() {name = "tf.bar"} : () -> tensor<*xf32> + %0 = tf_executor.island wraps "tf.PartitionedCall"() {Tin = [], Tout = [], config = "", config_proto = "", device = "", executor_type = "", f = @bar, name = "Call_bar"} : () -> () tf_executor.fetch } func.return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-gradient-attr.mlir b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-gradient-attr.mlir index c5aca980abb..4de7cb6ccaf 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-gradient-attr.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/mlir2graphdef/tf-gradient-attr.mlir @@ -8,10 +8,12 @@ func.func @main() { %0:2 = tf_executor.island wraps "tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", value = dense<2.500000e-01> : tensor} : () -> tensor loc("Const") // CHECK: node { - // CHECK-NEXT: name: "foo" - // CHECK-NEXT: op: "foo" - // CHECK-NEXT: input: "Const" - %1:2 = tf_executor.island wraps "tf.foo"(%0#0) {device = ""} : (tensor) -> tensor<*xf32> loc("foo") + // CHECK-NEXT: name: "tf.PartitionedCall" + // CHECK-NEXT: op: "PartitionedCall" + // CHECK: func { + // CHECK: name: "foo" + // CHECK: } + %1:2 = tf_executor.island wraps "tf.PartitionedCall"(%0) {Tin = [], Tout = [], config = "", config_proto = "", device = "", executor_type = "", f = @foo, name = "Call_foo"} : (tensor) -> 
tensor<*xf32> tf_executor.fetch } func.return diff --git a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir index f3fee9f74d4..02c144467d9 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/resource_op_lifting.mlir @@ -53,7 +53,7 @@ func.func @only_resource_store() -> tensor<*xi32> { // ----- -// Tests that a resource ops with both load and store are hoisted. +// Tests that resource ops with both load and store are hoisted. // CHECK-LABEL: func @same_resource_load_and_store func.func @same_resource_load_and_store() -> tensor<*xi32> { @@ -82,7 +82,7 @@ func.func @same_resource_load_and_store() -> tensor<*xi32> { // ----- -// Tests that a resource ops with both load and store are hoisted +// Tests that resource ops with both load and store are hoisted // but input to load and output from store have mixed defined/undefined shapes. // CHECK-LABEL: func @same_resource_load_and_store_cast @@ -114,13 +114,85 @@ func.func @same_resource_load_and_store_cast() -> tensor<1xi32> { // ----- -// Tests that internal resource operations are not hoisted. +// Tests that anonymous internal resource operations are eliminated. 
-// CHECK-LABEL: func @internal_resource -func.func @internal_resource() -> tensor<*xi32> { +// CHECK-LABEL: func @anonymous_internal_resource +func.func @anonymous_internal_resource() -> tensor<*xi32> { + + // CHECK: %[[COMPUTE1_RES:[0-9]*]] = "tf.SomeComputation1"() + %0 = "tf.SomeComputation1"() : () -> (tensor<*xi32>) // CHECK: %[[CLUSTER_RES:[0-9]*]] = "tf_device.cluster" - %0 = "tf_device.cluster"() ({ + // CHECK-NOT: "tf.VarHandleOp" + // CHECK-NOT: "tf.AssignVariableOp" + // CHECK-NOT: "tf.ReadVariableOp" + // CHECK: %[[COMPUTE2_RES:[0-9]*]] = "tf.SomeComputation2"(%[[COMPUTE1_RES]]) + // CHECK-NOT: "tf.AssignVariableOp" + // CHECK-NOT: "tf.ReadVariableOp" + // CHECK: tf_device.return %[[COMPUTE2_RES]] + // CHECK: {cluster_attr = "cluster_attr"} + // CHECK-SAME: () -> tensor<*xi32> + + %1 = "tf_device.cluster"() ( { + %1 = "tf.VarHandleOp"() {shared_name = "cd2c89b7-88b7-44c8-ad83-06c2a9158347"} : () -> tensor<*x!tf_type.resource>> + "tf.AssignVariableOp"(%1, %0) {dtype = i32} : (tensor<*x!tf_type.resource>>, tensor<*xi32>) -> () + %2 = "tf.ReadVariableOp"(%1) {dtype = i32} : (tensor<*x!tf_type.resource>>) -> tensor<*xi32> + %3 = "tf.SomeComputation2"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) + "tf.AssignVariableOp"(%1, %3) {dtype = i32} : (tensor<*x!tf_type.resource>>, tensor<*xi32>) -> () + %4 = "tf.ReadVariableOp"(%1) {dtype = i32} : (tensor<*x!tf_type.resource>>) -> tensor<*xi32> + tf_device.return %4 : tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> + + // CHECK: return %[[CLUSTER_RES]] + return %1 : tensor<*xi32> +} + +// ----- + +// Tests that anonymous internal resource operations (including DestroyResourceOp) are eliminated. 
+ +// CHECK-LABEL: func @anonymous_internal_resource_with_destroy +func.func @anonymous_internal_resource_with_destroy() -> tensor<*xi32> { + + // CHECK: %[[COMPUTE1_RES:[0-9]*]] = "tf.SomeComputation1"() + %0 = "tf.SomeComputation1"() : () -> (tensor<*xi32>) + + // CHECK: %[[CLUSTER_RES:[0-9]*]] = "tf_device.cluster" + // CHECK-NOT: "tf.VarHandleOp" + // CHECK-NOT: "tf.AssignVariableOp" + // CHECK-NOT: "tf.ReadVariableOp" + // CHECK: %[[COMPUTE2_RES:[0-9]*]] = "tf.SomeComputation2"(%[[COMPUTE1_RES]]) + // CHECK-NOT: "tf.AssignVariableOp" + // CHECK-NOT: "tf.ReadVariableOp" + // CHECK-NOT: "tf.DestroyResourceOp" + // CHECK: tf_device.return %[[COMPUTE2_RES]] + // CHECK: {cluster_attr = "cluster_attr"} + // CHECK-SAME: () -> tensor<*xi32> + + %1 = "tf_device.cluster"() ( { + %1 = "tf.VarHandleOp"() {shared_name = "cd2c89b7-88b7-44c8-ad83-06c2a9158347"} : () -> tensor<*x!tf_type.resource>> + "tf.AssignVariableOp"(%1, %0) {dtype = i32} : (tensor<*x!tf_type.resource>>, tensor<*xi32>) -> () + %2 = "tf.ReadVariableOp"(%1) {dtype = i32} : (tensor<*x!tf_type.resource>>) -> tensor<*xi32> + %3 = "tf.SomeComputation2"(%2) : (tensor<*xi32>) -> (tensor<*xi32>) + "tf.AssignVariableOp"(%1, %3) {dtype = i32} : (tensor<*x!tf_type.resource>>, tensor<*xi32>) -> () + %4 = "tf.ReadVariableOp"(%1) {dtype = i32} : (tensor<*x!tf_type.resource>>) -> tensor<*xi32> + "tf.DestroyResourceOp"(%1) {dtype = i32} : (tensor<*x!tf_type.resource>>) -> () + tf_device.return %4 : tensor<*xi32> + }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> + + // CHECK: return %[[CLUSTER_RES]] + return %1 : tensor<*xi32> +} + +// ----- + +// Tests that named internal resource operations are not hoisted. 
+ +// CHECK-LABEL: func @named_internal_resource +func.func @named_internal_resource() -> tensor<*xi32> { + + // CHECK: %[[CLUSTER_RES:[0-9]*]] = "tf_device.cluster" + %0 = "tf_device.cluster"() ( { // CHECK: %[[RES_HANDLE:[0-9]*]] = "tf.VarHandleOp" %1 = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> @@ -139,7 +211,7 @@ func.func @internal_resource() -> tensor<*xi32> { }) {cluster_attr = "cluster_attr"} : () -> tensor<*xi32> // CHECK: return %[[CLUSTER_RES]] - func.return %0 : tensor<*xi32> + return %0 : tensor<*xi32> } // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir index b489dd04e73..a760759e9ed 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/side-effect-analysis-test.mlir @@ -2628,3 +2628,39 @@ func.func @fetch_with_resource_operand( // expected-remark@above {{ID: 8}} // expected-remark@above {{Sinks: {7}}} } + +// ----- + +// Tests about we create the dependency PwStreamResults `start` and `end` +// within in XlaCallModule +func.func @_pws_program(%arg0: tensor {tf_saved_model.index_path = ["arg0"]}) -> (tensor {tf_saved_model.index_path = ["result0"]}, tensor {tf_saved_model.index_path = ["result1"]}) attributes {pws.program_id = 4722582128360897113 : i64, tf.entry_function = {}} { + // expected-remark@above {{ID: 7}} + "tf.PwStreamResults"(%arg0) {_callback_id = -2694175233261920887 : i64, _controller_address = "[2002:afb:afb::]:10004", _has_manual_control_dependencies = true, _model_name = "test", device = "/device/CPU", names = ["begin"]} : (tensor) -> () + // expected-remark@above {{ID: 0}} + // expected-remark@above {{Successors: {4}}} + %0:2 = "tf_device.cluster"() ({ + // expected-remark@above {{Predecessors: {0}}} + // expected-remark@above {{ID: 4}} + // expected-remark@above {{Successors: {5}}} + %1 
= "tf.XlaSharding"(%arg0) {_XlaSharding = "", sharding = "", unspecified_dims = []} : (tensor) -> tensor + // expected-remark@above {{ID: 1}} + %2:2 = "tf.XlaCallModule"(%1) {Sout = [#tf_type.shape<>, #tf_type.shape<>], dim_args_spec = [], function_list = [@__inference_callable_flat_tf_150], module = "ML\EFR\00__inference_callable_flat_tf_15\00", platforms = [], version = 5 : i64} : (tensor) -> (tensor, tensor) + // expected-remark@above {{ID: 2}} + tf_device.return %2#0, %2#1 : tensor, tensor + // expected-remark@above {{ID: 3}} + }) {_tpu_replicate = "cluster_0", allow_soft_placement = false, computation_shape = [], device_assignment = [], host_compute_core = [], num_cores_per_replica = 1 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", tpu_compile_options_proto = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : () -> (tensor, tensor) + "tf.PwStreamResults"(%arg0) {_callback_id = -2694175233261920887 : i64, _controller_address = "[2002:afb:afb::]:10004", _model_name = "test", device = "/device/CPU", names = ["end"]} : (tensor) -> () + // expected-remark@above {{Predecessors: {4}}} + // expected-remark@above {{ID: 5}} + return %0#0, %0#1 : tensor, tensor + // expected-remark@above {{ID: 6}} + // expected-remark@above {{Sinks: {5}}} +} +// expected-remark@below {{ID: 2}} +func.func private @__inference_callable_flat_tf_150(%arg0: tensor {tf._user_specified_name = "args_tf_flat_0"}, %arg1: tensor {tf._user_specified_name = "args_tf_flat_1"}) attributes {tf._XlaMustCompile = false, tf._construction_context = "kEagerRuntime", tf._input_shapes = [#tf_type.shape<>, #tf_type.shape<>], tf._original_func_name = "__inference_callable_flat_tf_15", tf.signature.is_stateful} { + "tf.PwStreamResults"(%arg0, %arg1) {_callback_id = -2694175233261920887 : i64, _controller_address = "[2002:afb:afb::]:10004", _model_name = "test", names = ["foo", "bar"]} : (tensor, tensor) -> () + // expected-remark@above {{ID: 0}} + return + // 
expected-remark@above {{ID: 1}} + // expected-remark@above {{Sinks: {0}}} +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir index 74363ecc967..c17ef278ba1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf-ops.mlir @@ -5134,3 +5134,11 @@ func.func @test_batch_function_with_invalid_symbol(%arg0: tensor<1x3xf32>, %arg1 "tf.BatchFunction"(%arg0, %arg1) {batch_timeout_micros = 100000 : i64, f = @undefined_function, max_batch_size = 6 : i64, max_enqueued_batches = 10 : i64, num_batch_threads = 1 : i64, operand_segment_sizes = array} : (tensor<1x3xf32>, tensor>>) -> tensor<*xf32> func.return } + +// ----- + +func.func @test_xla_call_module_with_invalid_symbol() { + // expected-error @below {{refers to an undefined function: @undefined_function}} + "tf.XlaCallModule"() {Sout = [], device = "", dim_args_spec = [], function_list = [@undefined_function], module = "", platforms = [], version = 4 : i64} : () -> () + func.return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD index 88c31e5057f..cee4ca7f782 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model/BUILD @@ -62,6 +62,7 @@ test_files = glob( ] glob_lit_tests( + name = "all_tests", data = [":test_utilities"], default_size = "medium", default_tags = [ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_asset_sinking.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_asset_sinking.mlir new file mode 100644 index 00000000000..2638aab86b8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_asset_sinking.mlir @@ -0,0 +1,30 @@ +// RUN: tf-opt %s -split-input-file -tf-saved-model-asset-sinking='saved-model-dir=foo/bar' | FileCheck %s + +// CHECK-LABEL: 
module @asset +module @asset attributes {tf_saved_model.semantics} { + "tf_saved_model.session_initializer"() {initializers = [@init]} : () -> () + + // CHECK-NOT: "tf_saved_model.asset" + "tf_saved_model.asset"() {filename = "assets/test0.txt", sym_name = "asset0"} : () -> () + "tf_saved_model.asset"() {filename = "assets/test1.txt", sym_name = "asset1"} : () -> () + + // CHECK: func @init() + func.func @init(%arg0: tensor {tf_saved_model.bound_input = @asset0}, %arg1: tensor {tf_saved_model.bound_input = @asset1}) attributes {tf_saved_model.exported_names = ["init"]} { + // CHECK-DAG: %[[ASSET0:.*]] = "tf.Const"() {value = dense<"foo/bar/assets/test0.txt"> : tensor} + // CHECK-DAG: %[[ASSET1:.*]] = "tf.Const"() {value = dense<"foo/bar/assets/test1.txt"> : tensor} + + // CHECK: %[[VAR0:.*]] = "tf.VarHandleOp"() + %0 = "tf.VarHandleOp"() {container = "", shared_name = "var0"} : () -> tensor>> + + // CHECK: "tf.AssignVariableOp"(%[[VAR0]], %[[ASSET0]]) + "tf.AssignVariableOp"(%0, %arg0) : (tensor>>, tensor) -> () + + // CHECK: %[[VAR1:.*]] = "tf.VarHandleOp"() + %1 = "tf.VarHandleOp"() {container = "", shared_name = "var1"} : () -> tensor>> + + // CHECK: "tf.AssignVariableOp"(%[[VAR1]], %[[ASSET1]]) + "tf.AssignVariableOp"(%1, %arg1) : (tensor>>, tensor) -> () + + func.return + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_to_hlo_pipeline/BUILD b/tensorflow/compiler/mlir/tensorflow/tests/tf_to_hlo_pipeline/BUILD index 954eca9c0e2..421bbd5de79 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_to_hlo_pipeline/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_to_hlo_pipeline/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = [ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_bridge_v1/BUILD 
b/tensorflow/compiler/mlir/tensorflow/tests/tpu_bridge_v1/BUILD index 954eca9c0e2..421bbd5de79 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_bridge_v1/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_bridge_v1/BUILD @@ -6,6 +6,7 @@ load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") licenses(["notice"]) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", test_file_exts = [ diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir index db266ed4afe..8852458137f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_cluster_formation.mlir @@ -838,7 +838,7 @@ func.func @valid_compilation_cluster_no_replication_op_device() { // CHECK-NOT: device = // CHECK: return func.func @valid_compilation_cluster_no_replication_op_device() { - "tf.opA"() { _xla_compile_device_type = "TPU", device = "/device:CPU:0"} : () -> () + "tf.opA"() { _xla_compile_device_type = "TPU", device = "/device:TPU:0"} : () -> () "tf.opB"() { _xla_compile_device_type = "TPU", device = "/task:0/device:TPU:1"} : () -> () func.return } diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_deserialization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_deserialization.mlir new file mode 100644 index 00000000000..be47ea6ff2c --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_deserialization.mlir @@ -0,0 +1,40 @@ +// RUN: tf-opt %s -split-input-file -tf-xla-call-module-deserialization | FileCheck %s + +// Tests that `tf.XlaCallModule` with both StableHLO module and TF function +// calls can be deserialized. 
+ +// CHECK-LABEL: module +module { + // CHECK-LABEL: func private @_tf_func + func.func private @_tf_func(%arg0: tensor, %arg1: tensor<*xi32>) { + // CHECK: tf.StreamResults + + // StreamResults is a pseudo op in this test. + "tf.StreamResults"(%arg0, %arg1) : (tensor, tensor<*xi32>) -> () + func.return + } + + // CHECK-LABEL: func @main + // CHECK-SAME: %[[ARG0:.*]]: tensor<10xi32>, %[[ARG1:.*]]: tensor<10xi32> + func.func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> { + // CHECK: %[[RESULT:.*]] = "tf.XlaCallModule"(%[[ARG0]], %[[ARG1]]) + // CHECK-SAME: _entry_function = @_stablehlo_main_0 + // CHECK-NOT: function_list + // CHECK-SAME: module = "" + + // `module` is stablehlo bytecode for: + // func.func @main(%arg0: tensor {jax.arg_info = "x", mhlo.sharding = "{replicated}"}, %arg1: tensor<*xi32>) -> (tensor {jax.result_info = ""}) { + // stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) {api_version = 2 : i32, has_side_effect = true, tf.backend_config = {called_func = @_tf_func}} : (tensor, tensor<*xi32>) -> () + // return %arg0 : tensor + // } + %0 = "tf.XlaCallModule"(%arg0, %arg1) {Sout = [#tf_type.shape], dim_args_spec = [], function_list = [@_tf_func], module = 
"ML\EFR\07StableHLO_v0.12.0\00\01\19\05\01\05\01\03\05\03\09\07\09\0B\0D\03\8Fm\0F\01?\0B\07\0B\0B\0B\0B\0B\13\0B\0F\133\133\13\13S\0B\0B\0B\0B\0B\0B\0B\0B\0B\13\13\0B\13\13\03/\0B\0B\0B\13\1B\0B\0B\0B\0B\0B\0B\0F\13\0B\0B\0B\0B\0B\0B\0B\13\0B\0F\01\03\0F\03\0D3\07\0B\1B\17\07\02:\03\05\0F\1F\05\11\05\13\05\15\05\17\05\19\03\03\11\13\05\1B\11\01\05\17\01S\15\03\0B\05E\07S\09U\0B[\0DA\17\011\07\03\0B\05?\07]\09?\0BC\0D_\17\01'\07\17\01)\0B\03\13#a%A'c)?+e-?/?1?3g\05\1D\05\1F\05!\05#\05%\05'\05)\05+\05-\17\013\0B\03\039C\05/\17\015\1B\17\017\0B\03\01\1D1\1D3\03\05GQ\0D\05IKMO\1D5\1D7\1D9\1D;\0D\01#\09\03\03W\0D\03YA\1D=\1D?#\0B\1DA\0B\05\1DC\05\03\0D\03ik\1DE\13\0D\01\01\02\04)\03\00\FF\FF\FF\FF\FF\FF\FF\FF\05\1B3\05\11\05\03\07\03\03\11\03\03\03\03\1D\04}\05\01\11\15\0F\07\04m\03\01\09\03\11\19\17\05\03\07\0F\05\03\03\07\03\00\07\055!\05\01\03\09\07;7\03\03\03\01\05\04=\03\05\03\11\1D\1B\05\03\03\07\03\03\03\00\05\04\1F\03\01\06\03\01\05\01\00\9E\07G\1B)\11\0B!\1B\1D\05\1B\1B\03\0F%\1F/!!)#\1F\19)\1F\13\15\1D\15G\11\1F\15\11\0F\0B\11builtin\00vhlo\00module\00func_v1\00return_v1\00custom_call_v1\00call_v1\00xla_call_module_serialization.mlir\00arg_attrs\00function_type\00res_attrs\00sym_name\00sym_visibility\00mhlo.num_partitions\00api_version\00backend_config\00call_target_name\00called_computations\00has_side_effect\00operand_layouts\00output_operand_aliases\00result_layouts\00tf.backend_config\00callee\00\00_stablehlo_f\00jax.arg_info\00x\00mhlo.sharding\00{replicated}\00jax.result_info\00main\00private\00tf.call_tf_function\00called_index\00", platforms = [], version = 5 : i64} : (tensor<10xi32>, tensor<10xi32>) -> tensor<10xi32> + // CHECK: return %[[RESULT]] + func.return %0 : tensor<10xi32> + } + + // CHECK-LABEL: func private @_stablehlo_main_0 + // CHECK-SAME: (%[[ARG0:.*]]: tensor {jax.arg_info = "x", mhlo.sharding = "{replicated}"}, %[[ARG1:.*]]: tensor<*xi32>) -> (tensor {jax.result_info = ""}) attributes {_from_xla_call_module} { + // CHECK: 
stablehlo.custom_call @tf.call_tf_function(%[[ARG0]], %[[ARG1]]) {api_version = 2 : i32, has_side_effect = true, tf.backend_config = {called_func = @_tf_func}} : (tensor, tensor<*xi32>) -> () + // CHECK: return %arg0 : tensor + // CHECK: } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_round_trip.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_round_trip.mlir new file mode 100644 index 00000000000..446a61cabc8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_round_trip.mlir @@ -0,0 +1,60 @@ +// RUN: tf-opt %s -split-input-file -tf-xla-call-module-serialization -tf-xla-call-module-deserialization | FileCheck %s + +// Tests that running xla-call-module-serialization followed by +// xla-call-module-deserialization preserves the original module. +// +// Note that function names may be different, but arguments, attributes, +// results, and function body should be the same. + +// CHECK-LABEL: module +module { + // CHECK-LABEL: func @main + // CHECK-SAME: %[[ARG0:.*]]: tensor<10xi32>, %[[ARG1:.*]]: tensor<10xi32> + func.func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> { + // CHECK: %[[RESULT:.*]] = "tf.XlaCallModule"(%[[ARG0]], %[[ARG1]]) + // CHECK-SAME: Sout = [#tf_type.shape] + // CHECK-SAME: _entry_function = @_stablehlo_main_0 + // CHECK-SAME: _stablehlo_module_attrs = {} + // CHECK-NOT: function_list + // CHECK-SAME: module = "" + // CHECK-SAME: platforms = [] + // CHECK-SAME: version = 5 + + %0 = "tf.XlaCallModule"(%arg0, %arg1) {Sout = [#tf_type.shape], dim_args_spec = [], _entry_function = @_stablehlo_main_0, module = "", platforms = [], version = 5 : i64} : (tensor<10xi32>, tensor<10xi32>) -> tensor<10xi32> + // CHECK: return %[[RESULT]] + func.return %0 : tensor<10xi32> + } + + // CHECK-LABEL: func private @_tf_func + func.func private @_tf_func(%arg0: tensor, %arg1: tensor<*xi32>) { + // CHECK: tf.StreamResults + + // StreamResults is a pseudo op in this test. 
+ "tf.StreamResults"(%arg0, %arg1) : (tensor, tensor<*xi32>) -> () + func.return + } + + // CHECK-LABEL: func private @_stablehlo_main_0 + // CHECK-SAME: %[[ARG0:.*]]: tensor {jax.arg_info = "x", mhlo.sharding = "{replicated}"} + // CHECK-SAME: %[[ARG1:.*]]: tensor<*xi32>) + // CHECK-SAME: (tensor {jax.result_info = ""}) + // CHECK-SAME: attributes {_from_xla_call_module} + func.func private @_stablehlo_main_0(%arg0: tensor {jax.arg_info = "x", mhlo.sharding = "{replicated}"}, %arg1: tensor<*xi32>) -> (tensor {jax.result_info = ""}) attributes {_from_xla_call_module} { + // CHECK: stablehlo.custom_call @tf.call_tf_function(%[[ARG0]], %[[ARG1]]) + // CHECK-SAME: { + // CHECK-SAME: api_version = 2 : i32, + // CHECK-SAME: has_side_effect = true, + // CHECK-SAME: tf.backend_config = {called_func = @_tf_func} + // CHECK-SAME: } + stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) {api_version = 2 : i32, has_side_effect = true, tf.backend_config = {called_func = @_tf_func}} : (tensor, tensor<*xi32>) -> () + // CHECK: call @_stablehlo__stablehlo_f_0 + %arg2 = func.call @_stablehlo_f(%arg0) : (tensor) -> (tensor) + return %arg2 : tensor + } + + // CHECK-LABEL: func private @_stablehlo__stablehlo_f_0 + // CHECK: attributes {_from_xla_call_module} + func.func private @_stablehlo_f(%arg0: tensor) -> (tensor) attributes {_from_xla_call_module} { + return %arg0 : tensor + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_serialization.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_serialization.mlir new file mode 100644 index 00000000000..e51433e38bc --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_call_module_serialization.mlir @@ -0,0 +1,45 @@ +// RUN: tf-opt %s -split-input-file -tf-xla-call-module-serialization | FileCheck %s + +// Tests that stablehlo functions called by XlaCallModuleOp in the top-level +// module can be serialized into bytecode and embedded in XlaCallModuleOp's +// `module` attribute. 
+ +// CHECK-LABEL: module +module { + // CHECK-LABEL: func private @_tf_func + func.func private @_tf_func(%arg0: tensor, %arg1: tensor<*xi32>) { + // CHECK: tf.StreamResults + + // StreamResults is a pseudo op in this test. + "tf.StreamResults"(%arg0, %arg1) : (tensor, tensor<*xi32>) -> () + func.return + } + + // CHECK-NOT: @_stablehlo_f + func.func private @_stablehlo_f(%arg0: tensor) -> (tensor) attributes {_from_xla_call_module} { + return %arg0 : tensor + } + + // CHECK-NOT: @_stablehlo_main_0 + func.func private @_stablehlo_main_0(%arg0: tensor {jax.arg_info = "x", mhlo.sharding = "{replicated}"}, %arg1: tensor<*xi32>) -> (tensor {jax.result_info = ""}) attributes {_from_xla_call_module} { + stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) {api_version = 2 : i32, has_side_effect = true, tf.backend_config = {called_func = @_tf_func}} : (tensor, tensor<*xi32>) -> () + %arg2 = func.call @_stablehlo_f(%arg0) : (tensor) -> (tensor) + return %arg2 : tensor + } + + // CHECK-LABEL: func @main + // CHECK-SAME: %[[ARG0:.*]]: tensor<10xi32>, %[[ARG1:.*]]: tensor<10xi32> + func.func @main(%arg0: tensor<10xi32>, %arg1: tensor<10xi32>) -> tensor<10xi32> { + // CHECK: %[[RESULT:.*]] = "tf.XlaCallModule"(%[[ARG0]], %[[ARG1]]) + // CHECK-SAME: Sout = [#tf_type.shape] + // CHECK-SAME: dim_args_spec = [] + // CHECK-NOT: _entry_function + // CHECK-NOT: _stablehlo_module_attrs + // CHECK-SAME: function_list = [@_tf_func] + // CHECK-SAME: module = "ML\EFR{{.*}}" + + %0 = "tf.XlaCallModule"(%arg0, %arg1) {Sout = [#tf_type.shape], dim_args_spec = [], _entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = { mhlo.num_partitions = 1 }, module = "", platforms = [], version = 5 : i64} : (tensor<10xi32>, tensor<10xi32>) -> tensor<10xi32> + // CHECK: return %[[RESULT]] + func.return %0 : tensor<10xi32> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_iputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_iputs.mlir new file mode 
100644 index 00000000000..f7166ae11f4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/xla_validate_iputs.mlir @@ -0,0 +1,11 @@ +// RUN: tf-opt %s -split-input-file -verify-diagnostics -tf-xla-validate-inputs + +// expected-error @+1 {{CPU/GPU MLIR phase 1 pipeline does not support nested calls of entry functions}} +func.func @nested_entry_functions(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} { + %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @func} : (tensor) -> (tensor) + func.return %0 : tensor +} + +func.func @func(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} { + func.return %arg0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc index 74f5a458b0b..fcc70ab1b39 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/bridge.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include +#include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -25,7 +26,6 @@ limitations under the License. 
#include "tensorflow/compiler/jit/flags.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" @@ -148,8 +148,18 @@ void CreateTPUBridgePipelineImpl( pm.addNestedPass( CreateTPUReorderReplicateAndPartitionedInputsPass()); pm.addNestedPass(TF::CreateDecomposeReduceDatasetPass()); - pm.addPass(TFDevice::CreateEmbeddingPipeliningPass()); + if (tensorflow::GetBuildXlaOpsPassFlags() + ->tf_xla_disable_full_embedding_pipelining) { + pm.addPass(TFDevice::CreateEmbeddingSequencingPass()); + } else { + pm.addPass(TFDevice::CreateEmbeddingPipeliningPass()); + } pm.addPass(CreateTPUClusterFormationPass()); + // CreateEmbeddingPipeliningPass may have created more functions, but + // TPUClusterCleanup and OutsideCompiledToHostLaunch need every function to be + // only called from one cluster. Here, we choose to fix the all-funcs-one-use + // invariant right before it's needed, not after it's been broken. + pm.addPass(mlir::TF::CreateGuaranteeAllFuncsOneUsePass()); // Run TPU cluster cleanup attributes so ops with no outside compiled // attribute have no host device attribute. pm.addPass(CreateTPUClusterCleanupAttributesPass()); @@ -404,6 +414,8 @@ void CreateTFXLABridgePipeline(OpPassManager &pm) { VLOG(2) << "Create TF XLA Bridge pipeline"; pm.addNestedPass( TF::CreateCanonicalizeCompileAndReplicateAttributesPass()); + // This pass expectes unified compilation markers. 
+ pm.addPass(TFDevice::CreateXlaValidateInputsPass()); const llvm::SmallVector ops_to_preserve = {}; pm.addNestedPass( tf_executor::CreateTFExecutorGraphPruningPass(ops_to_preserve)); @@ -425,6 +437,12 @@ void CreateTFXLABridgePipeline(OpPassManager &pm) { pm.addNestedPass(createCanonicalizerPass()); // Decompose resource ops. pm.addPass(TFDevice::CreateDecomposeResourceOpsInClusterPass()); + // TODO(b/267193636): Remove this flag when outside compilation + // for generic pipeline is landed. + if (tensorflow::GetMlirCommonFlags() + ->tf_mlir_enable_generic_outside_compilation) { + pm.addPass(TF::CreateTFFunctionalControlFlowToRegions()); + } // Run another shape inference pass because resource decomposition might have // created new partial types. Also, after dropping `shape_invariant` attribute // from While/WhileRegion ops within cluster would lead to more precise diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h b/tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h deleted file mode 100644 index 6d27780316f..00000000000 --- a/tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-==============================================================================*/ - -#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CALL_GRAPH_UTIL_H_ -#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CALL_GRAPH_UTIL_H_ - -#include -#include - -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project - -namespace mlir { - -// Find the outermost ops with any of specified types starting from the tree -// rooted at `root` parameter. The results are stored in `ops`. Addtional -// filters can be specified by providing `predicate` parameter. -template -LogicalResult GetOutermostOpsOfType( - func::FuncOp root, SymbolTable &symtab, llvm::SmallVector &ops, - const std::function &predicate = {}) { - std::stack worklist; - worklist.push(root); - while (!worklist.empty()) { - func::FuncOp u = worklist.top(); - worklist.pop(); - auto result = u.walk([&](SymbolUserOpInterface op) { - if (llvm::isa(op) && (!predicate || predicate(op))) { - ops.push_back(op); - return WalkResult::advance(); - } - for (auto attr : op->getAttrs()) { - auto sym = attr.getValue().dyn_cast(); - if (!sym) continue; - auto v = symtab.lookup(sym.getRootReference()); - if (!v) { - // This is not expected to happen in practice. 
- op->emitError() << "Cannot find function " << sym.getRootReference(); - return WalkResult::interrupt(); - } - worklist.push(v); - } - return WalkResult::advance(); - }); - if (result.wasInterrupted()) return failure(); - } - return success(); -} - -} // namespace mlir - -#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CALL_GRAPH_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc index 481f2d868e1..84220aa346b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.cc @@ -16,39 +16,18 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/transforms/constant_fold.h" #include -#include -#include -#include -#include #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/OpDefinition.h" // from @llvm-project -#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h" #include "tensorflow/core/platform/mutex.h" -#include "tensorflow/core/tfrt/fallback/fallback_state.h" -#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" -#include "tensorflow/tsl/util/device_name_utils.h" namespace mlir { namespace TF { -static bool IsOk(const tensorflow::Status& s) { - if (s.ok()) return true; - VLOG(2) << s.message(); - return false; -} - -#define RETURN_FAILURE_IF_ERROR(expr) \ - if (!IsOk(expr)) { \ - return mlir::failure(); \ - } - // Implements a 
TF specific policy on when constant folding is allowed. // Policy: // @@ -63,7 +42,7 @@ static bool IsOk(const tensorflow::Status& s) { // (`kResultsSizeThreshold`), or // 2. size of results is within a factor (`kSizeFactor`) of size of operands, or // TODO(b/157226221): Look into other heuristics for constant fold policy. -static bool ShouldBeFolded(Operation* inst) { +static bool IsFoldedByDefaultPolicy(Operation* inst) { bool has_unknown_shape = false; auto get_size = [&](TypeRange types) { int64_t size = 0; @@ -98,142 +77,14 @@ static bool ShouldBeFolded(Operation* inst) { (results_size <= kSizeFactor * operands_size)); } -static const tensorflow::tfrt_stub::FallbackState& GetDefaultFallbackState() { - static const auto* const fallback_state = []() { - tensorflow::SessionOptions session_options; - tensorflow::FunctionDefLibrary fdef_lib; - auto fallback_state = - tensorflow::tfrt_stub::FallbackState::CreateWithCpuDevice( - session_options, fdef_lib) - .value(); - return fallback_state.release(); - }(); - - return *fallback_state; -} - -static std::function)>* GetDefaultRunner() { - static auto* const default_runner = - new std::function)>( - [](const std::function& f) { f(); }); - return default_runner; -} - -static mlir::LogicalResult EvaluateOperation( - mlir::Operation* inst, llvm::ArrayRef operands, - llvm::SmallVectorImpl* results) { - // If any operand is nullptr returns true for a failure. - // TODO(b/120678030): remove this constraint if we find operators can be - // evaluated with some unknown operands. - if (std::any_of(operands.begin(), operands.end(), - [](mlir::Attribute operand) { return !operand; })) { - VLOG(1) << "Can't evaluate since not all operands are constant."; - return mlir::failure(); - } - - // Builds TF operation and sets all the attributes. 
- std::string node_name = "unnamed"; - if (auto attr = inst->getAttrOfType("name")) { - node_name = std::string(attr.getValue()); - } - auto node_def_or = tensorflow::ConvertTFDialectOpToNodeDef( - inst, node_name.c_str(), /*ignore_unregistered_attrs=*/true); - RETURN_FAILURE_IF_ERROR(node_def_or.status()); - const auto& node_def = node_def_or.value(); - - const auto& fallback_state = GetDefaultFallbackState(); - - // Explicitly set device to Host CPU instead of the device present in device - // attribute of the MLIR op. The assigned device might be remote, not - // available during compilation or compilation only device for on demand - // execution which may create a recursion if used for constant folding. - auto host_cpu = tensorflow::DeviceNameUtils::FullName( - /*job=*/"localhost", /*replica=*/0, /*task=*/0, /*type=*/"CPU", /*id=*/0); - - auto statusor_runner = tensorflow::tfrt_stub::OpKernelRunner::Create( - node_def->op(), node_def->name(), host_cpu, operands.size(), - [&](tensorflow::AttrValueMap* attr_value_map) { - *attr_value_map = node_def->attr(); - return tensorflow::OkStatus(); - }, - fallback_state.device_manager(), - fallback_state.process_function_library_runtime()); - RETURN_FAILURE_IF_ERROR(statusor_runner.status()); - const auto& runner = *statusor_runner; - - VLOG(1) << "Start to evaluate node: " << node_def->DebugString(); - - std::vector inputs; - - // Adds inputs to the TF operation. - for (const auto operand : operands) { - tensorflow::Tensor tensor; - RETURN_FAILURE_IF_ERROR(tensorflow::ConvertToTensor(operand, &tensor)); - inputs.push_back(std::move(tensor)); - } - - std::vector input_values; - for (auto& tensor : inputs) { - input_values.emplace_back(); - input_values.back().tensor = &tensor; - } - - tensorflow::OpKernelContext::Params params; - params.inputs = input_values; - params.device = runner.device(); - params.op_kernel = runner.op_kernel(); - // Still use original device's resource_manager. 
- params.resource_manager = runner.resource_manager(); - params.input_alloc_attrs = runner.input_alloc_attrs(); - params.output_attr_array = runner.output_alloc_attrs().data(); - // Following two parameters are used to support executing tf.data via - // fallback. - params.function_library = runner.function_library_runtime(); - params.runner = GetDefaultRunner(); - - // Executes the TF operation. - tensorflow::OpKernelContext op_kernel_context(¶ms); - runner.Run(&op_kernel_context); - RETURN_FAILURE_IF_ERROR(op_kernel_context.status()); - - // Converts the outputs to MLIR attributes. - mlir::Builder builder(inst->getContext()); - - for (int i = 0; i < op_kernel_context.num_outputs(); ++i) { - DCHECK(op_kernel_context.mutable_output(i)); - auto attr_or = tensorflow::ConvertTensor( - *op_kernel_context.mutable_output(i), &builder); - RETURN_FAILURE_IF_ERROR(attr_or.status()); - results->push_back(attr_or.value()); - } - - VLOG(1) << "Evaluate node " << node_name << " successfully!"; - - return mlir::success(); -} - LogicalResult ConstantFoldFallbackHook( Operation* inst, ArrayRef operands, SmallVectorImpl& results) { // NOLINT - // Instructions with side effects should not be constant folded to preserve - // the original semantics. Ops that have no side effect and zero results but - // could be folded should have a custom folder instead of relying on the - // TensorFlow folding hook. - if (inst->getNumResults() == 0 || - inst->hasTrait() || - inst->getNumRegions() != 0 || !isMemoryEffectFree(inst)) - return failure(); + if (!CanBeFolded(inst)) return failure(); - // If any of the result types are variants, don't try to constant fold them. - // This creates opaque variant constants which lose information and would - // require "raising" later. 
- for (auto type : inst->getResultTypes()) { - if (auto tensor_type = type.dyn_cast()) { - if (tensor_type.getElementType().isa()) { - return failure(); - } - } - } + // Determine if we should attempt to fold this operation by considering the + // size/size increase due to folding. + if (!IsFoldedByDefaultPolicy(inst)) return failure(); // If all the results are empty and has numerical element types, set results // to empty elements attribute. This is restricted to the numerical element @@ -259,15 +110,6 @@ LogicalResult ConstantFoldFallbackHook( return success(); } - // Do not execute function calls. - if (llvm::isa(inst)) { - return failure(); - } - - // Determine if we should attempt to fold this operation by considering the - // size/size increase due to folding. - if (!ShouldBeFolded(inst)) return failure(); - // Returns directly if any of the operands is not an elements attributes. if (std::any_of(operands.begin(), operands.end(), [](Attribute attr) { return !attr || !attr.isa(); @@ -284,8 +126,8 @@ LogicalResult ConstantFoldFallbackHook( // TODO(jpienaar): Avoid using global context & mutex here. static auto* mu = new tensorflow::mutex(); tensorflow::mutex_lock l(*mu); - SmallVector constants; - LogicalResult status = EvaluateOperation(inst, inputs, &constants); + SmallVector constants; + LogicalResult status = EvaluateOperation(inst, inputs, constants); results.assign(constants.begin(), constants.end()); return status; } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc index fbd5541c137..6d28fa03a98 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.cc @@ -15,77 +15,182 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h" +#include +#include #include +#include +#include +#include -#include "tensorflow/c/tf_status.h" -#include "tensorflow/core/platform/logging.h" -#include "tensorflow/core/protobuf/config.pb.h" -#include "tensorflow/tsl/platform/mem.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner.h" namespace mlir { namespace TF { -TFE_Context* GetContextForConstantFold() { - std::unique_ptr status( - TF_NewStatus(), TF_DeleteStatus); - std::unique_ptr opts( - TFE_NewContextOptions(), TFE_DeleteContextOptions); - // Only initialize single CPU. - tensorflow::ConfigProto config_proto; - // This is conceptually equal to what we do in python/eager/context.py but - // with all GPU/TPU devices ignored and CPU only set to 1. - (*config_proto.mutable_device_count())["CPU"] = 1; - config_proto.add_device_filters("/device:CPU:*"); - // Limit the thread pool size. Without this, TF by default creates as many - // threads as the number of CPUs (`port::MaxParallelism()`). This can be - // expensive since this TFE context persists the entire program execution. - config_proto.set_inter_op_parallelism_threads(2); - std::unique_ptr config( - TF_NewBuffer(), TF_DeleteBuffer); - DCHECK(config->data == nullptr); +using tensorflow::tfrt_stub::FallbackState; +using tensorflow::tfrt_stub::OpKernelRunner; - // Copy config_proto into config. 
- { - const size_t proto_size = config_proto.ByteSizeLong(); - void* buf = tsl::port::Malloc(proto_size); - if (buf == nullptr) { - LOG(ERROR) << "Failed to allocate memory to serialize ConfigProto " - "while creating context options for constant folding"; - return nullptr; +static bool IsOk(const tensorflow::Status& s) { + if (s.ok()) return true; + VLOG(2) << s.message(); + return false; +} + +#define RETURN_FAILURE_IF_ERROR(expr) \ + if (!IsOk(expr)) { \ + return mlir::failure(); \ + } + +bool CanBeFolded(Operation* inst) { + // Instructions with side effects should not be constant folded to preserve + // the original semantics. Ops that have no side effect and zero results but + // could be folded should have a custom folder instead of relying on the + // TensorFlow folding hook. + if (inst == nullptr || inst->getNumResults() == 0 || + inst->hasTrait() || + inst->getNumRegions() != 0 || !isMemoryEffectFree(inst)) { + return false; + } + + // If any of the result types are variants, don't try to constant fold them. + // This creates opaque variant constants which lose information and would + // require "raising" later. 
+ for (const Type type : inst->getResultTypes()) { + if (const TensorType tensor_type = type.dyn_cast()) { + if (tensor_type.getElementType().isa()) { + return false; + } } - if (!config_proto.SerializeWithCachedSizesToArray( - static_cast(buf))) { - tsl::port::Free(buf); - LOG(ERROR) << "Unable to serialize ConfigProto while creating context " - "options for constant folding"; - return nullptr; - } - config->data = buf; - config->length = proto_size; - config->data_deallocator = [](void* data, size_t length) { - tsl::port::Free(data); - }; } - TFE_ContextOptionsSetConfig(opts.get(), config->data, config->length, - status.get()); - if (TF_GetCode(status.get()) != TF_OK) { - LOG(ERROR) << "Failed to set context options for constant folding: " - << status.get(); - return nullptr; + // Operations that execute function calls shouldn't be constant folded. + if (llvm::isa(inst)) { + return false; } - // Input tensors are placed on the host CPU so use the explicit device - // policy to fail if no CPU kernels are available for the op. 
- TFE_ContextOptionsSetDevicePlacementPolicy(opts.get(), - TFE_DEVICE_PLACEMENT_EXPLICIT); - auto ctx = TFE_NewContext(opts.get(), status.get()); - if (TF_GetCode(status.get()) != TF_OK) { - LOG(ERROR) << "Failed to create context for constant folding: " - << status.get(); - return nullptr; + return true; +} + +static const FallbackState& GetDefaultFallbackState() { + static const auto* const fallback_state = []() { + tensorflow::SessionOptions session_options; + tensorflow::FunctionDefLibrary fdef_lib; + auto fallback_state = + FallbackState::CreateWithCpuDevice(session_options, fdef_lib).value(); + return fallback_state.release(); + }(); + + return *fallback_state; +} + +static std::function)>* GetDefaultRunner() { + static auto* const default_runner = + new std::function)>( + [](const std::function& f) { f(); }); + return default_runner; +} + +LogicalResult EvaluateOperation(Operation* inst, + llvm::ArrayRef operands, + llvm::SmallVector& results) { + // If any operand is nullptr returns true for a failure. + // TODO(b/120678030): remove this constraint if we find operators can be + // evaluated with some unknown operands. + if (std::any_of(operands.begin(), operands.end(), + [](Attribute operand) { return !operand; })) { + VLOG(1) << "Can't evaluate since not all operands are constant."; + return failure(); } - return ctx; + + // Builds TF operation and sets all the attributes. + std::string node_name = "unnamed"; + if (const StringAttr attr = inst->getAttrOfType("name")) { + node_name = std::string(attr.getValue()); + } + absl::StatusOr> node_def = + tensorflow::ConvertTFDialectOpToNodeDef( + inst, node_name.c_str(), /*ignore_unregistered_attrs=*/true); + RETURN_FAILURE_IF_ERROR(node_def.status()); + + const FallbackState& fallback_state = GetDefaultFallbackState(); + + // Explicitly set device to Host CPU instead of the device present in device + // attribute of the MLIR op. 
The assigned device might be remote, not + // available during compilation or compilation only device for on demand + // execution which may create a recursion if used for constant folding. + std::string host_cpu = tensorflow::DeviceNameUtils::FullName( + /*job=*/"localhost", /*replica=*/0, /*task=*/0, /*type=*/"CPU", /*id=*/0); + + absl::StatusOr runner = OpKernelRunner::Create( + node_def->get()->op(), node_def->get()->name(), host_cpu, operands.size(), + [&](tensorflow::AttrValueMap* attr_value_map) { + *attr_value_map = node_def->get()->attr(); + return tensorflow::OkStatus(); + }, + fallback_state.device_manager(), + fallback_state.process_function_library_runtime()); + RETURN_FAILURE_IF_ERROR(runner.status()); + + VLOG(1) << "Start to evaluate node: " << node_def->get()->DebugString(); + + std::vector inputs; + + // Adds inputs to the TF operation. + for (const ElementsAttr& operand : operands) { + tensorflow::Tensor tensor; + RETURN_FAILURE_IF_ERROR(tensorflow::ConvertToTensor(operand, &tensor)); + inputs.push_back(std::move(tensor)); + } + + std::vector input_values; + for (tensorflow::Tensor& tensor : inputs) { + input_values.emplace_back(); + input_values.back().tensor = &tensor; + } + + tensorflow::OpKernelContext::Params params; + params.inputs = input_values; + params.device = runner->device(); + params.op_kernel = runner->op_kernel(); + + // Still use original device's resource_manager. + params.resource_manager = runner->resource_manager(); + params.input_alloc_attrs = runner->input_alloc_attrs(); + params.output_attr_array = runner->output_alloc_attrs().data(); + + // Following two parameters are used to support executing tf.data via + // fallback. + params.function_library = runner->function_library_runtime(); + params.runner = GetDefaultRunner(); + + // Executes the TF operation. 
+ tensorflow::OpKernelContext op_kernel_context(¶ms); + runner->Run(&op_kernel_context); + RETURN_FAILURE_IF_ERROR(op_kernel_context.status()); + + // Converts the outputs to MLIR attributes. + Builder builder(inst->getContext()); + + for (int i = 0; i < op_kernel_context.num_outputs(); ++i) { + DCHECK(op_kernel_context.mutable_output(i)); + absl::StatusOr result_attr = tensorflow::ConvertTensor( + *op_kernel_context.mutable_output(i), &builder); + RETURN_FAILURE_IF_ERROR(result_attr.status()); + results.push_back(result_attr.value()); + } + + VLOG(1) << "Evaluate node " << node_name << " successfully!"; + + return success(); } } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h index 8f28735d2a9..636dde98d2b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/constant_fold_utils.h @@ -16,12 +16,21 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CONSTANT_FOLD_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_CONSTANT_FOLD_UTILS_H_ -#include "tensorflow/c/eager/c_api.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project namespace mlir { namespace TF { -TFE_Context* GetContextForConstantFold(); +// Checks whether the given TF operation can be folded or not. +bool CanBeFolded(Operation* inst); + +// Evaluates the operation with given operand values. 
+LogicalResult EvaluateOperation(Operation* inst, + llvm::ArrayRef operands, + llvm::SmallVector& results); } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc index e5671bf5961..84b161a0fd7 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_pipelining.cc @@ -13,53 +13,160 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// This pass implements automated pipelining for TPU embeddings defined using -// the TF2 Embedding API. This is designed for applications that have an -// embedding lookup on the SparseCore, followed by one or more dense layers on -// TensorCores, optionally followed by a backward pass (training update) with -// more ops on the SparseCore. Ops are broken up into: -// 1. SC forward pass -// 2. TC forward/backward pass -// 3. SC backward pass -// 4. non-TPU loop counter updates -// These 4 functions are then staggered so as to enable parallel execution. +/****************************************************************************** +This pass implements automated pipelining for TPU embeddings defined using +the TF2 Embedding API. This is designed for applications that have an +embedding lookup on the SparseCore, followed by one or more dense layers on +TensorCores, optionally followed by a backward pass (training update) with +more ops on the SparseCore. Ops are broken up into: + 1. SC forward pass + 2. TC forward/backward pass + 3. SC backward pass + 4. non-TPU loop counter updates +These 4 functions are then staggered so as to enable parallel execution. 
+ +In pseudocode, the algorithm is as follows: + +// Start step 0 +C_0 = cond(args_0) +N_0 = non_tup(args_0) +if (C_0) { + F_0 = forward(args_0, N_0) + T_0 = core_tpu(args_0, N_0, F_0) + // B_0 = backward() is not evaluated here. +} + +args_1 = update_args(args_0, N_0, T_0) + +// Start step 1 +C_1 = cond(args_1) +N_1 = non_tup(args_1) +if (C_1) { + F_1 = forward(args_1, N_1) + // T_1 = core_tpu() is not evaluated here. + // B_1 = backward() is not evaluated here. +} + +// Partial update of args. We expect this to be sufficient +// for evaluating cond(). +args_2a = update_args(args_1, N_1) // NO T_1 here + +// Conditional for step 2 +C_2 = cond(args_2) + +new_while_body (new_args) { // starts at i==2 + // Finish step i-2 + B_im2 = backward(args_im2, N_im2, F_im2, T_im2) + + // Advance step i-1 + T_im1 = core_tpu(args_im1, N_im1, F_im1) + + // Finish the update of args_2 + args_i = args_2b = update_args(args_2a, T_im1) + + // Start step i + N_i = non_tpu(args_i) + F_i = forward(args_i, N_i) + + // Conditional update + args_ip1 = update_args(args_i, N_i) // T_i is lagged. + C_ip1 = cond(args_ip1) + + return (...) +} +// Note: the tf.while conditional is based on Ci which is initially C2. The +// tf.while op returns the inputs unmodified if the initial conditional is +// false. Thus, the following special cases hold for N <= 2: +// N==0 | N==1 | N==2 | N==3 +// ----------------------------- +// C_nm2 == C_0 -> false | true | true | true +// C_nm1 == C_1 -> false | false | true | true + +// Finish step N-2 +if (C_nm2) { + backward(args_nm2, N_nm2, F_nm2, T_nm2) +} + +// Finish step N-1 +if (C_nm1) { + T_nm1 = core_tpu(args_nm1, N_nm1, F_nm1) + backward(args_nm1, N_nm1, F_nm1, T_nm1) +} + +// To match the original, un-pipelined while loop, we need to return the +// correct results from the pipelined version. 
Nominally, we'd like to do +// this: +// if ( NOT(C_nm2) ) { +// return args_nm2 +// } else if (NOT(C_nm1)) { +// return args_nm1 +// } else { +// return args_n +// } +// but we don't have if/else-if operators. We can convert this to a CaseOp. +// Note, if C_nm1==true and C_nm2 must also be true. +branch_index = int(C_nm2) + int(C_nm1) +selected_results = switch(branch_index) { + case 0: return args_nm2 + case 1: return args_nm1 + case 2: return args_n +} +return selected_results +******************************************************************************/ #include +#include #include #include #include #include +// #include "smartass/brain/ops/flogs_ops.h" +#include "absl/log/log.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/str_replace.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SetVector.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Region.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from 
@llvm-project #include "mlir/Transforms/InliningUtils.h" // from @llvm-project #include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #define GEN_PASS_DEF_EMBEDDINGPIPELININGPASS #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" static constexpr char kEmbeddingPipelining[] = "_embedding_pipelining"; +static constexpr char kEmbeddingPipeliningInlineAttr[] = + "_embedding_pipelining_inline"; static constexpr char kEmbeddingForward[] = "forward"; static constexpr char kEmbeddingBackward[] = "backward"; static constexpr char kDevice[] = "device"; +static constexpr char kLower[] = "_lower_using_switch_merge"; static constexpr llvm::StringRef kTpuCompilationStatus = "_tpu_compilation_status"; @@ -67,24 +174,6 @@ namespace mlir { namespace TFDevice { namespace { -struct EmbeddingPipeliningPass - : public ::impl::EmbeddingPipeliningPassBase { - void getDependentDialects(mlir::DialectRegistry& registry) const override { - registry.insert(); - } - - void runOnOperation() override; -}; - -template -std::vector GetValueTypes(const InputContainer& input) { - // Convert a list of mlir::Value's into a list of mlir::Type's - std::vector types; - types.reserve(input.size()); - for (auto val : input) types.push_back(val.getType()); - return types; -} - bool IsResourceType(Type val_type) { if (auto tensor_type = val_type.dyn_cast()) { if (tensor_type.getElementType().isa()) { @@ -94,9 +183,14 @@ bool IsResourceType(Type val_type) { return false; } -bool IsTPUOp(mlir::Operation* op) { - return op->hasAttr(TF::kReplicationInfoAttr); -} +struct EmbeddingPipeliningPass + : public ::impl::EmbeddingPipeliningPassBase { + void getDependentDialects(mlir::DialectRegistry& registry) const override { + 
registry.insert(); + } + + void runOnOperation() override; +}; StringAttr GetReplicationAttr(mlir::Operation* op) { return op->getAttrOfType(TF::kReplicationInfoAttr); @@ -108,12 +202,313 @@ StringAttr GetReplicationAttr(TF::TPUCompilationResultOp op) { return op->getAttrOfType(kTpuCompilationStatus); } +// Replaces the replication region attribute if it already exists. +void UpdateReplicationAttr(Operation* op, StringAttr attr) { + if (op->hasAttr(TF::kReplicationInfoAttr)) { + op->setAttr(TF::kReplicationInfoAttr, attr); + } +} + +// Replaces the replication region attribute if it already exists. +void UpdateReplicationAttr(TF::TPUCompilationResultOp& op, StringAttr attr) { + // Special case for getting the replication region for + // TPUCompilationResultsOp. + if (op->hasAttr(kTpuCompilationStatus)) { + op->setAttr(kTpuCompilationStatus, attr); + } +} + +// A helper class to inline TF::StatefulPartitionedCall ops +struct Inliner : public InlinerInterface { + Inliner(OpBuilder& builder, SymbolTable& symbol_table) + : InlinerInterface(builder.getContext()), + builder(builder), + symbol_table(symbol_table) {} + + bool isLegalToInline(Operation* call, Operation* callable, + bool wouldBeCloned) const override { + return true; + } + bool isLegalToInline(Region* dest, Region* src, bool wouldBeCloned, + IRMapping& valueMapping) const override { + return true; + } + bool isLegalToInline(Operation* op, Region* dest, bool wouldBeCloned, + IRMapping& valueMapping) const override { + return true; + } + + // Don't recursively analyze operations, because they can all be "inlined". 
+ bool shouldAnalyzeRecursively(Operation* op) const override { return true; } + + LogicalResult UnifyReplicationInfo(func::FuncOp func) { + auto new_repl_info = + builder.getStringAttr(func.getSymName().str() + "_repl_info"); + for (auto& op : func.getRegion().getOps()) { + if (auto compile_op = llvm::dyn_cast(op)) { + UpdateReplicationAttr(compile_op, new_repl_info); + } else { + UpdateReplicationAttr(&op, new_repl_info); + } + } + return LogicalResult::success(); + } + + // After inlining, there will likely be some instances where a + // TPUReplicatedInput feeds directly into a TPUReplicatedOutput. Find such + // pairs and remove them. + LogicalResult RemoveOutputInputPairs(func::FuncOp func) { + llvm::SetVector ops_to_erase; + // Inlining can result in multiple TPUCompilationResultOp and + // TPUReplicateMetadataOp ops. Only keep one, the first will do fine. + TF::TPUCompilationResultOp compile_op = nullptr; + for (auto op : func.getRegion().getOps()) { + if (compile_op == nullptr) { + compile_op = op; + } else { + ops_to_erase.insert(op); + } + } + // If there's no outside compilation, we can exit early because this isn't + // a TPU function. + if (compile_op == nullptr) { + return LogicalResult::success(); + } + + TF::TPUReplicateMetadataOp metadata_op = nullptr; + for (auto op : func.getRegion().getOps()) { + if (metadata_op == nullptr) + metadata_op = op; + else + ops_to_erase.insert(op); + } + if (metadata_op == nullptr) { + func->emitError( + "Expected to find TPUReplicateMetadataOps but found none."); + return LogicalResult::failure(); + } + + for (auto output_op : + func.getRegion().getOps()) { + bool outputs_are_returned = false; + TF::TPUReplicatedInputOp input_op = nullptr; + // Only visit each user of the results once. 
+ llvm::SetVector seen_users; + for (auto user : output_op->getUsers()) { + if (!seen_users.insert(user)) continue; + if (llvm::isa(user)) { + if (input_op != nullptr) { + func->emitError( + "Found multiple TPUReplicatedInput ops but only expected 1."); + return LogicalResult::failure(); + } + input_op = llvm::dyn_cast(user); + } + if (llvm::isa(user)) { + outputs_are_returned = true; + } + } + if (input_op == nullptr) continue; + + // If we found matching input ops, we can remove the TPUReplicatedInput + // ops and replace their result values with the inputs to the matching + // TPUReplicatedOutput op. + replaceAllUsesInRegionWith(input_op.getResult(), output_op.getOperand(), + func.getRegion()); + ops_to_erase.insert(input_op); + + // If the outputs aren't also returned from this function, then we can + // remove the TPUReplicatedOutput op as well. In some cases we'll + // still need these ops. + if (!outputs_are_returned) ops_to_erase.insert(output_op); + } + for (auto op : ops_to_erase) op->erase(); + + return LogicalResult::success(); + } + + LogicalResult RemoveDuplicateReplication(func::FuncOp func) { + llvm::SetVector ops_to_erase; + llvm::MapVector cache; + for (auto input_op : func.getRegion().getOps()) { + // We're only expecting a single input argument to be replicated. + if (input_op->getNumOperands() > 1) continue; + Value operand = input_op->getOperand(0); + if (!llvm::isa(operand)) continue; + BlockArgument arg = llvm::dyn_cast(operand); + + // See if we've run across this TPUReplicatedInputOp before. + if (!cache.insert({arg, input_op}).second) { + // We've seen this before. Replace this instance with the cached op. 
+ for (auto p : + llvm::zip(input_op->getResults(), cache[arg]->getResults())) { + replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), + func.getRegion()); + } + ops_to_erase.insert(input_op); + } + } + for (auto op : ops_to_erase) op->erase(); + return LogicalResult::success(); + } + + // Find any StatefulPartitionedCalls and inline their contents in this func. + LogicalResult InlineCallsInFunc(func::FuncOp func, + bool inline_all_funcs = false) { + llvm::SetVector ops_to_erase; + for (auto caller : + func.getRegion().getOps()) { + if (!inline_all_funcs && + !caller->hasAttr(kEmbeddingPipeliningInlineAttr)) { + continue; + } + Operation* symbol = symbol_table.lookup(caller.getF()); + if (symbol == nullptr) { + func.emitError() << "Symbol not found in SymbolTable: " + << caller.getF(); + return LogicalResult::failure(); + } + if (!llvm::isa(symbol)) { + func.emitError() << "Invalid callee: " << caller.getF(); + return LogicalResult::failure(); + } + auto callee = + llvm::dyn_cast(symbol_table.lookup(caller.getF())); + auto& src_region = callee.getRegion(); + auto result = inlineCall(*this, caller, callee, &src_region, true); + if (failed(result)) { + func.emitError("Inliner failed"); + return result; + } + ops_to_erase.insert(caller); + } + for (auto op : ops_to_erase) op->erase(); + + auto result = UnifyReplicationInfo(func); + if (failed(result)) return result; + + result = RemoveOutputInputPairs(func); + if (failed(result)) return result; + + result = RemoveDuplicateReplication(func); + if (failed(result)) return result; + + return LogicalResult::success(); + } + + private: + OpBuilder& builder; + SymbolTable& symbol_table; +}; + +LogicalResult EliminateResourceLoops(OpBuilder& builder, + SymbolTable& symbol_table, + func::FuncOp func) { + // Examine all StatefulPartitionedCall ops that have resources as return + // types. 
If the returned resource traces back to an input argument for the + // SPC, then replace uses of the returned copy with the original input. + // + // Note: This does not descend through nested SCPs. + auto ComesFromBlockArgNumber = [](Value val) -> int { + while (true) { + if (auto block_arg = llvm::dyn_cast(val)) { + return block_arg.getArgNumber(); + } + if (auto identity_op = + llvm::dyn_cast(val.getDefiningOp())) { + val = identity_op.getOperand(); + } else { + return -1; + } + } + }; + + for (auto call_op : + func.getRegion().getOps()) { + for (int i = 0; i < call_op->getNumResults(); ++i) { + if (IsResourceType(call_op->getResult(i).getType())) { + Operation* symbol = symbol_table.lookup(call_op.getF()); + if (symbol == nullptr) { + func.emitError() << "Symbol not found in SymbolTable: " + << call_op.getF(); + return LogicalResult::failure(); + } + if (!llvm::isa(symbol)) { + func.emitError() << "Invalid callee: " << call_op.getF(); + return LogicalResult::failure(); + } + auto callee = + llvm::dyn_cast(symbol_table.lookup(call_op.getF())); + func::ReturnOp return_op = *callee.getOps().begin(); + auto val = return_op.getOperand(i); + auto block_arg_number = ComesFromBlockArgNumber(val); + if (block_arg_number >= 0) { + replaceAllUsesInRegionWith(call_op->getResult(i), + call_op->getOperand(block_arg_number), + func.getRegion()); + } + } + } + } + return LogicalResult::success(); +} + +struct Callers { + TF::StatefulPartitionedCallOp forward; + TF::StatefulPartitionedCallOp core_tpu; + TF::StatefulPartitionedCallOp backward; + TF::StatefulPartitionedCallOp non_tpu; +}; + +template +std::vector GetValueTypes(const InputContainer& input) { + // Convert a list of mlir::Value's into a list of mlir::Type's + std::vector types; + types.reserve(input.size()); + for (auto val : input) types.push_back(val.getType()); + return types; +} + +bool IsTPUOp(mlir::Operation* op) { + return op->hasAttr(TF::kReplicationInfoAttr); +} + +template +void Append(Vector& a, const 
Container& b) { + a.insert(a.end(), b.begin(), b.end()); +} + +template +void Append(Vector& a, const Vector& b) { + a.insert(a.end(), b.begin(), b.end()); +} + int64_t GetNumOps(func::FuncOp func) { int64_t num_ops = 0; for (auto it = func.begin(); it != func.end(); ++it) ++num_ops; return num_ops; } +std::vector ResultsAsVector(Operation* op) { + std::vector vec; + vec.reserve(op->getNumResults()); + for (auto res : op->getResults()) vec.push_back(res); + return vec; +} + +void SetBasicBlockAttributes(OpBuilder& builder, Operation* op) { + op->setAttr(kDevice, builder.getStringAttr("")); + op->setAttr(kLower, builder.getBoolAttr(true)); +} + +std::vector ResultsAsVector(Operation* op, int begin, int num) { + int end = begin + num; + std::vector vec; + vec.reserve(end - begin); + for (int i = begin; i < end; ++i) vec.push_back(op->getResult(i)); + return vec; +} + void GatherOpsForExtraction(mlir::SetVector* operations, const mlir::SetVector& ops_to_avoid, bool predecessors, bool successors) { @@ -158,9 +553,11 @@ void GatherOpsForExtraction(mlir::SetVector* operations, } } -TF::StatefulPartitionedCallOp MakeFuncCaller( - mlir::OpBuilder& builder, const Location& loc, func::FuncOp func, - const llvm::SetVector& operands) { +TF::StatefulPartitionedCallOp MakeFuncCaller(mlir::OpBuilder& builder, + const Location& loc, + func::FuncOp func, + const ArrayRef& operands, + bool flag_for_inlining) { // Constructs a tf.StatefulPartitionedCall to the function provided in 'func' // using the operands in 'operands'. Assumes the insertion point on builder is // already set. 
@@ -168,60 +565,65 @@ TF::StatefulPartitionedCallOp MakeFuncCaller( mlir::SymbolRefAttr::get(builder.getContext(), func.getSymName()); auto result_types = func.getResultTypes(); auto caller = builder.create( - loc, result_types, operands.getArrayRef(), symbol, + loc, result_types, operands, symbol, /*config=*/builder.getStringAttr(""), /*config_proto=*/builder.getStringAttr(""), /*executor_type=*/builder.getStringAttr("")); caller.setFAttr(symbol); + + // Set an attribute that our inliner will look for when choosing which + // TF::StatefulPartitionedCallOps to inline. + if (flag_for_inlining) + caller->setAttr(kEmbeddingPipeliningInlineAttr, builder.getBoolAttr(true)); return caller; } -func::FuncOp CreateFnWithSignature(ModuleOp module, +func::FuncOp CreateFnWithSignature(ModuleOp module, SymbolTable& symbol_table, const llvm::SetVector& inputs, const llvm::SetVector& outputs, const std::string& name) { // Creates an empty func.FuncOp with a signature compatible with 'inputs' // (operands) and 'outputs' (results). 
OpBuilder builder(module); - - std::vector input_types = GetValueTypes(inputs); - std::vector output_types = GetValueTypes(outputs); + auto in_types = GetValueTypes(inputs); + auto out_types = GetValueTypes(outputs); builder.setInsertionPointToEnd(&module.getBodyRegion().back()); - func::FuncOp func_op = builder.create( - module.getLoc(), name, - builder.getFunctionType(input_types, output_types)); + auto func_op = builder.create( + module.getLoc(), name, builder.getFunctionType(in_types, out_types)); func_op.setPrivate(); - + symbol_table.insert(func_op); return func_op; } TF::StatefulPartitionedCallOp EncapsulateOpsInFunc( - OpBuilder& builder, const llvm::SetVector& ops, + OpBuilder& builder, SymbolTable& symbol_table, + const llvm::SetVector& ops, const llvm::SetVector& inputs, const llvm::SetVector& outputs, - func::FuncOp parent_func, ModuleOp module, const std::string& name) { + func::FuncOp parent_func, ModuleOp module, const std::string& name, + bool flag_for_inlining) { // Moves all of the Operations in 'ops' into a newly created func.FuncOp // function named 'name' and replaces the original ops with a call to the // newly created function using a tf.StatefulPartitionedCall. Here, // 'parent_func' is the function that holds the original set of ops. // Note, 'inputs' and 'outputs' are the predetermined set of values that // should become the operands and return values, respectively. - auto insertion_point = builder.saveInsertionPoint(); - func::FuncOp new_func = CreateFnWithSignature(module, inputs, outputs, - absl::StrCat("_func_", name)); + auto saved_insertion_point = builder.saveInsertionPoint(); + func::FuncOp new_func = + CreateFnWithSignature(module, symbol_table, inputs, outputs, name); // This preserves the order of the ops that was in the original parent - // funtion. This is critical for preserving correctness in the presence of + // function. 
This is critical for preserving correctness in the presence of // resource variables and stateful functions. std::vector topological_order; for (Operation& op : parent_func.getOps()) if (ops.contains(&op)) topological_order.push_back(&op); // Create the partitioned call - builder.restoreInsertionPoint(insertion_point); - auto caller = MakeFuncCaller(builder, module.getLoc(), new_func, inputs); + builder.restoreInsertionPoint(saved_insertion_point); + auto caller = MakeFuncCaller(builder, module.getLoc(), new_func, + inputs.getArrayRef(), flag_for_inlining); Block* block = new_func.addEntryBlock(); - for (Operation* op : topological_order) op->moveBefore(block, block->end()); // Replace the 'inputs' values with the new function's arguments. @@ -293,7 +695,7 @@ LogicalResult FindAndExcludeOp(func::FuncOp func, } LogicalResult FindOwningWhileOp(func::FuncOp body_func, ModuleOp module, - TF::WhileOp* while_op) { + TF::WhileOp& while_op) { // Given a while loop body function 'body_func', find the tf.While Op that // uses it. 
auto uses_optional = body_func.getSymbolUses(module); @@ -301,14 +703,14 @@ LogicalResult FindOwningWhileOp(func::FuncOp body_func, ModuleOp module, body_func.emitOpError() << "no use of while loop body"; return LogicalResult::failure(); } - *while_op = nullptr; + while_op = nullptr; for (auto& use : uses_optional.value()) { if (llvm::isa(use.getUser())) { - if (*while_op != nullptr) { + if (while_op != nullptr) { use.getUser()->emitOpError() << "multiple users of function."; return LogicalResult::failure(); } else { - *while_op = llvm::cast(use.getUser()); + while_op = llvm::cast(use.getUser()); } } else { use.getUser()->emitOpError() << "non while use of function."; @@ -397,15 +799,13 @@ LogicalResult FindForwardPassOps(OpBuilder& builder, if (use_in_forward && use_in_not_forward) { loop_body_func.emitOpError() << "resource input " << argument.getArgNumber() - << " is used both in the forwards and " - << "not forward passes dataset"; + << " is used both in the forwards and not forward passes dataset"; return LogicalResult::failure(); } if (is_non_variable && is_variable) { loop_body_func.emitOpError() << "resource input " << argument.getArgNumber() - << " is used both as a varible and not " - << " a variable"; + << " is used both as a variable and not a variable"; return LogicalResult::failure(); } if (is_variable && use_in_forward) @@ -461,7 +861,7 @@ LogicalResult FindForwardPassOps(OpBuilder& builder, } } - VLOG(2) << "Cloned " << cloned_inputs << " TPUReplicatedInputOps"; + VLOG(3) << "Cloned " << cloned_inputs << " TPUReplicatedInputOps"; // Add TPUReplicatedInput/TPUReplicatedOutput pairs along each edge. 
llvm::SetVector new_forward_ops; @@ -515,7 +915,7 @@ LogicalResult FindForwardPassOps(OpBuilder& builder, } } - VLOG(2) << "inserted " << new_forward_ops.size() << " TPU Input/Output ops"; + VLOG(3) << "Inserted " << new_forward_ops.size() << " TPU Input/Output ops."; forward_pass_ops.insert(new_forward_ops.begin(), new_forward_ops.end()); return LogicalResult::success(); } @@ -537,7 +937,7 @@ LogicalResult FindBackwardPassOps( GatherOpsForExtraction(&backward_pass_ops, merged_set, /*predecessors=*/false, /*successors=*/true); - VLOG(3) << "found " << backward_pass_ops.size() << " backwards pass ops"; + VLOG(3) << "Found " << backward_pass_ops.size() << " backwards pass ops."; // If any inputs are to the backward_pass_ops region are direct // TPUReplicatedInput ops, then include (if this is the only use) or @@ -719,10 +1119,12 @@ LogicalResult FindNonTPUOps(llvm::SetVector& non_tpu_ops, } LogicalResult ExtractOpsAsFunc( - OpBuilder& builder, ModuleOp module, llvm::SetVector& ops, - StringAttr replication_attr, TF::TPUReplicateMetadataOp metadata_op, + OpBuilder& builder, ModuleOp module, SymbolTable& symbol_table, + llvm::SetVector& ops, StringAttr replication_attr, + TF::TPUReplicateMetadataOp metadata_op, TF::TPUCompilationResultOp compilation_op, func::FuncOp parent_func, - const std::string& func_name, Operation** caller) { + const std::string& func_name, TF::StatefulPartitionedCallOp* caller, + bool flag_for_inlining) { // Move the given set of 'ops' into it's own function and replace them with a // call to that function ('caller'). 
if 'metadata_op' and 'compilation_op' are // non-null, also insert those (i.e., target the resulting function to the @@ -753,8 +1155,9 @@ LogicalResult ExtractOpsAsFunc( } llvm::SetVector outputs; for (auto output : results) outputs.insert(output); - auto tf_caller = EncapsulateOpsInFunc(builder, ops, inputs, outputs, - parent_func, module, func_name); + auto tf_caller = + EncapsulateOpsInFunc(builder, symbol_table, ops, inputs, outputs, + parent_func, module, func_name, flag_for_inlining); if (!ops.empty() && metadata_op != nullptr && compilation_op != nullptr) UpdateAndInsertTPUOps(tf_caller, metadata_op, compilation_op, replication_attr); @@ -762,8 +1165,464 @@ LogicalResult ExtractOpsAsFunc( return LogicalResult::success(); } +LogicalResult FindSourceTPUReplicatedOutput( + Value val, TF::TPUReplicatedOutputOp& rep_out) { + Operation* op = val.getDefiningOp(); + if (auto src = llvm::dyn_cast(op)) { + rep_out = src; + return LogicalResult::success(); + } + if (auto src = llvm::dyn_cast(op)) { + return FindSourceTPUReplicatedOutput(src->getOperand(0), rep_out); + } + op->emitOpError() << "Value did not come from a TPUReplicatedOutput op: " + << val; + return LogicalResult::failure(); +} + +int FindReturnIndex(Value val) { + const int not_found = -1; + for (auto user : val.getUsers()) { + if (auto ret_op = llvm::dyn_cast(user)) { + for (auto index = 0; index < ret_op->getNumOperands(); ++index) { + if (val == ret_op->getOperand(index)) { + return index; + } + } + } + if (auto ident_op = llvm::dyn_cast(user)) { + auto index = FindReturnIndex(ident_op->getResult(0)); + if (index != not_found) return index; + } + } + return not_found; +} + +void AddAssertion(OpBuilder& builder, Location& loc, Value cond, + const std::string& message) { + auto shape_type = + RankedTensorType::get({1}, builder.getType()); + auto msg = builder.create( + loc, DenseStringElementsAttr::get(shape_type, + llvm::ArrayRef{message})); + builder.create(loc, cond, msg.getResult()); +} + 
+LogicalResult StartStep0(OpBuilder& builder, Location& loc, + SymbolTable& symbol_table, + TF::TPUReplicateMetadataOp& metadata_op, + TF::TPUCompilationResultOp& compilation_op, + Value& cond_value, Callers& callers, + const std::vector& loop_operands_nm0, + TF::StatefulPartitionedCallOp& caller) { + const std::string name = "start_step_0"; + + AddAssertion(builder, loc, cond_value, + "Auto-pipelining requires at least two steps."); + auto insertion_point = builder.saveInsertionPoint(); + + func::FuncOp orig_parent_func = + callers.backward->getParentOfType(); + + std::vector operands = loop_operands_nm0; + + // Input types will be the same as the original loop body. + std::vector input_types = GetValueTypes(operands); + + // Determine the results types. + // Return ALL outputs, respecting the provided order of the Operations. This + // makes it straightforward for users of this function to map the return + // values. + llvm::SetVector ops; + ops.insert(callers.forward); + ops.insert(callers.core_tpu); + std::vector result_map; + result_map.reserve(callers.forward->getNumResults() + + callers.core_tpu->getNumResults()); + int result_pos = 0; + for (auto res : callers.forward->getResults()) { + bool is_output = false; + for (auto user : res.getUsers()) { + if (!ops.contains(user)) { + is_output = true; + break; + } + } + result_map.push_back(is_output ? result_pos++ : -1); + } + std::vector result_types; + Append(result_types, callers.forward->getResultTypes()); + Append(result_types, callers.core_tpu->getResultTypes()); + + // Create the function based on input and result types and values. 
+ auto func_type = + mlir::FunctionType::get(builder.getContext(), input_types, result_types); + func::FuncOp then_func = func::FuncOp::create(loc, name, func_type); + then_func.setPrivate(); + symbol_table.insert(then_func); + mlir::OpBuilder func_builder = + mlir::OpBuilder::atBlockBegin(then_func.addEntryBlock()); + + // This must match the concatenation order in 'operands' above. + IRMapping ir_map; + int pos = 0; + for (auto orig : orig_parent_func.getArguments()) + ir_map.map(orig, then_func.getArgument(pos++)); + + // Clone the specified ops into the new function. + auto new_forward = func_builder.insert(callers.forward->clone(ir_map)); + for (auto p : + llvm::zip(callers.core_tpu->getResults(), new_forward->getResults())) + ir_map.map(std::get<0>(p), std::get<1>(p)); + auto new_core_tpu = func_builder.insert(callers.core_tpu->clone(ir_map)); + + // Add the function return; + std::vector results; + Append(results, new_forward->getResults()); + Append(results, new_core_tpu->getResults()); + func_builder.create(loc, results); + + // Inline any StatefulPartitionCall Ops. 
+ auto result = Inliner(builder, symbol_table).InlineCallsInFunc(then_func); + if (failed(result)) return result; + + builder.restoreInsertionPoint(insertion_point); + caller = MakeFuncCaller(builder, loc, then_func, operands, + /*flag_for_inlining=*/false); + return LogicalResult::success(); +} + +LogicalResult StartStep1(OpBuilder& builder, Location& loc, + SymbolTable& symbol_table, + TF::TPUReplicateMetadataOp& metadata_op, + TF::TPUCompilationResultOp& compilation_op, + Value& cond_value, Callers& callers, + const std::vector& loop_operands_1, + TF::StatefulPartitionedCallOp& caller) { + const std::string name = "start_step_1"; + + AddAssertion(builder, loc, cond_value, + "Auto-pipelining requires at least two steps."); + + auto insertion_point = builder.saveInsertionPoint(); + func::FuncOp orig_parent_func = + callers.backward->getParentOfType(); + + std::vector operands = loop_operands_1; + + // Input types will be the same as the original loop body. + std::vector input_types = GetValueTypes(operands); + + // Determine the results types. + // Return ALL outputs, respecting the provided order of the Operations. This + // makes it straightforward for users of this function to map the return + // values. + auto result_types = callers.forward->getResultTypes(); + + // Create the function based on input and result types and values. + auto func_type = + mlir::FunctionType::get(builder.getContext(), input_types, result_types); + func::FuncOp then_func = func::FuncOp::create(loc, name, func_type); + then_func.setPrivate(); + symbol_table.insert(then_func); + mlir::OpBuilder func_builder = + mlir::OpBuilder::atBlockBegin(then_func.addEntryBlock()); + + // This must match the concatenation order in 'operands' above. + IRMapping ir_map; + int pos = 0; + for (auto orig : orig_parent_func.getArguments()) + ir_map.map(orig, then_func.getArgument(pos++)); + + // Clone the specified ops into the new function. 
+ auto new_forward = func_builder.insert(callers.forward->clone(ir_map)); + + // Add the function return; + func_builder.create(loc, new_forward->getResults()); + + // Inline any StatefulPartitionCall Ops. + auto result = Inliner(builder, symbol_table).InlineCallsInFunc(then_func); + if (failed(result)) return result; + + builder.restoreInsertionPoint(insertion_point); + caller = MakeFuncCaller(builder, loc, then_func, operands, + /*flag_for_inlining=*/false); + return LogicalResult::success(); +} + +LogicalResult FinishStepNm2(OpBuilder& builder, Location& loc, + SymbolTable& symbol_table, + TF::TPUReplicateMetadataOp& metadata_op, + TF::TPUCompilationResultOp& compilation_op, + Value& cond_value, Callers& callers, + const std::vector& loop_operands_nm2, + const std::vector& forward_res_nm2, + const std::vector& core_tpu_res_nm2, + TF::StatefulPartitionedCallOp& caller) { + const std::string name = "finish_step_nm2"; + + AddAssertion(builder, loc, cond_value, + "Auto-pipelining requires at least two steps."); + + auto insertion_point = builder.saveInsertionPoint(); + func::FuncOp orig_parent_func = + callers.backward->getParentOfType(); + + std::vector operands = loop_operands_nm2; + Append(operands, forward_res_nm2); + Append(operands, core_tpu_res_nm2); + + // Input types will be the same as the original loop body. + std::vector input_types = GetValueTypes(operands); + + // Determine the results types. + // Return ALL outputs, respecting the provided order of the Operations. This + // makes it straightforward for users of this function to map the return + // values. + auto result_types = callers.backward->getResultTypes(); + + // Create the function based on input and result types and values. 
+ auto func_type = + mlir::FunctionType::get(builder.getContext(), input_types, result_types); + func::FuncOp then_func = func::FuncOp::create(loc, name, func_type); + then_func.setPrivate(); + symbol_table.insert(then_func); + mlir::OpBuilder func_builder = + mlir::OpBuilder::atBlockBegin(then_func.addEntryBlock()); + + // This must match the concatenation order in 'operands' above. + IRMapping ir_map; + int pos = 0; + for (auto orig : orig_parent_func.getArguments()) + ir_map.map(orig, then_func.getArgument(pos++)); + for (auto orig : callers.forward->getResults()) + ir_map.map(orig, then_func.getArgument(pos++)); + for (auto orig : callers.core_tpu->getResults()) + ir_map.map(orig, then_func.getArgument(pos++)); + + // Clone the specified ops into the new function. + auto new_backward = func_builder.insert(callers.backward->clone(ir_map)); + + // Add the function return; + func_builder.setInsertionPointAfter(new_backward); + func_builder.create(loc, new_backward->getResults()); + + // Inline any StatefulPartitionCall Ops. 
+ auto result = Inliner(builder, symbol_table).InlineCallsInFunc(then_func); + if (failed(result)) return result; + + builder.restoreInsertionPoint(insertion_point); + caller = MakeFuncCaller(builder, loc, then_func, operands, + /*flag_for_inlining=*/false); + return LogicalResult::success(); +} + +LogicalResult FinishStepNm1(OpBuilder& builder, Location& loc, + SymbolTable& symbol_table, + TF::TPUReplicateMetadataOp& metadata_op, + TF::TPUCompilationResultOp& compilation_op, + Value& cond_value, Callers& callers, + const std::vector& loop_operands_nm1, + const std::vector& forward_res_nm1, + TF::StatefulPartitionedCallOp& caller) { + const std::string name = "finish_step_nm1"; + + AddAssertion(builder, loc, cond_value, + "Auto-pipelining requires at least two steps."); + + auto insertion_point = builder.saveInsertionPoint(); + func::FuncOp orig_parent_func = + callers.backward->getParentOfType(); + + std::vector operands = loop_operands_nm1; + Append(operands, forward_res_nm1); + + // Input types will be the same as the original loop body. + std::vector input_types = GetValueTypes(operands); + + // Determine the results types. + // Return ALL outputs, respecting the provided order of the Operations. This + // makes it straightforward for users of this function to map the return + // values. + std::vector result_types; + Append(result_types, callers.core_tpu->getResultTypes()); + Append(result_types, callers.backward->getResultTypes()); + + // Create the function based on input and result types and values. + auto func_type = + mlir::FunctionType::get(builder.getContext(), input_types, result_types); + func::FuncOp then_func = func::FuncOp::create(loc, name, func_type); + then_func.setPrivate(); + symbol_table.insert(then_func); + mlir::OpBuilder func_builder = + mlir::OpBuilder::atBlockBegin(then_func.addEntryBlock()); + + // This must match the concatenation order in 'operands' above. 
+ IRMapping ir_map; + int pos = 0; + for (auto orig : orig_parent_func.getArguments()) + ir_map.map(orig, then_func.getArgument(pos++)); + for (auto orig : callers.forward->getResults()) + ir_map.map(orig, then_func.getArgument(pos++)); + + // Clone the specified ops into the new function. + auto new_core_tpu = func_builder.insert(callers.core_tpu->clone(ir_map)); + for (auto p : + llvm::zip(callers.core_tpu->getResults(), new_core_tpu->getResults())) + ir_map.map(std::get<0>(p), std::get<1>(p)); + auto new_backward = func_builder.insert(callers.backward->clone(ir_map)); + // Add the function return; + std::vector results; + Append(results, new_core_tpu->getResults()); + Append(results, new_backward->getResults()); + func_builder.create(loc, results); + + // Inline any StatefulPartitionCall Ops. + auto result = Inliner(builder, symbol_table).InlineCallsInFunc(then_func); + if (failed(result)) return result; + + builder.restoreInsertionPoint(insertion_point); + caller = MakeFuncCaller(builder, loc, then_func, operands, + /*flag_for_inlining=*/false); + return LogicalResult::success(); +} + +LogicalResult MakeForwardOperands(Operation* forward_caller, + Operation* non_tpu_caller, + const std::vector& loop_operands, + const std::vector& non_tpu_res, + std::vector& f_operands) { + f_operands.clear(); + f_operands.reserve(forward_caller->getNumOperands()); + for (auto operand : forward_caller->getOperands()) { + if (llvm::isa(operand)) { + // Pull this from the original operands to the original while op. 
+ auto arg = llvm::cast(operand); + f_operands.push_back(loop_operands[arg.getArgNumber()]); + continue; + } + auto src = operand.getDefiningOp(); + auto res = llvm::cast(operand); + if (src == non_tpu_caller) { + f_operands.push_back(non_tpu_res[res.getResultNumber()]); + } else { + forward_caller->emitOpError() + << "Unknown op source for operand " << operand; + return LogicalResult::failure(); + } + } + return LogicalResult::success(); +} + +LogicalResult MakeCoreTPUOperands(Operation* core_tpu_caller, + Operation* non_tpu_caller, + Operation* forward_caller, + const std::vector& loop_operands, + const std::vector& non_tpu_res, + const std::vector& forward_res, + std::vector& t_operands) { + t_operands.clear(); + t_operands.reserve(core_tpu_caller->getNumOperands()); + for (auto operand : core_tpu_caller->getOperands()) { + if (llvm::isa(operand)) { + // Pull this from the original operands to the original while op. + auto arg = llvm::cast(operand); + t_operands.push_back(loop_operands[arg.getArgNumber()]); + continue; + } + auto src = operand.getDefiningOp(); + auto res = llvm::cast(operand); + if (src == non_tpu_caller) { + t_operands.push_back(non_tpu_res[res.getResultNumber()]); + } else if (src == forward_caller) { + t_operands.push_back(forward_res[res.getResultNumber()]); + } else { + core_tpu_caller->emitOpError() << "Unknown op source for operand " + << operand << ": " << src->getName(); + return LogicalResult::failure(); + } + } + return LogicalResult::success(); +} + +LogicalResult MakeBackwardOperands(Operation* forward_caller, + Operation* core_tpu_caller, + Operation* backward_caller, + const std::vector& loop_operands, + const std::vector& forward_res, + const std::vector& core_tpu_res, + std::vector& b_operands) { + b_operands.clear(); + b_operands.reserve(backward_caller->getNumOperands()); + for (auto operand : backward_caller->getOperands()) { + if (llvm::isa(operand)) { + // Pull this from the original operands to the original while op. 
+ auto arg = llvm::cast(operand); + b_operands.push_back(loop_operands[arg.getArgNumber()]); + continue; + } + auto src = operand.getDefiningOp(); + auto res = llvm::cast(operand); + if (src == forward_caller) { + b_operands.push_back(forward_res[res.getResultNumber()]); + } else if (src == core_tpu_caller) { + b_operands.push_back(core_tpu_res[res.getResultNumber()]); + } else { + // Note: we're expecting no edges from non_tpu() to backward(). + backward_caller->emitOpError() << "Unknown op source for operand " + << operand << ": " << src->getName(); + return LogicalResult::failure(); + } + } + return LogicalResult::success(); +} + +LogicalResult MakeNonTPUOperands(Operation* non_tpu_caller, + const std::vector& loop_operands, + std::vector& n_operands) { + n_operands.clear(); + n_operands.reserve(non_tpu_caller->getNumOperands()); + for (auto operand : non_tpu_caller->getOperands()) { + if (llvm::isa(operand)) { + auto arg = llvm::cast(operand); + n_operands.push_back(loop_operands[arg.getArgNumber()]); + continue; + } + // This shouldn't happen: + auto src = operand.getDefiningOp(); + non_tpu_caller->emitOpError() << "Unknown op source for operand " << operand + << ": " << src->getName(); + return LogicalResult::failure(); + } + return LogicalResult::success(); +} + +Operation* LiftNonTpuFuncCaller(mlir::OpBuilder& builder, + Operation* orig_non_tpu_caller, + const std::vector& operands) { + // Use this to clone an op and lift it outside its parent function. The + // original while body is unchanged. Example: + // Original: + // %x = tf.while(%a, %b) + // ... + // while_body: + // call(f=@sc_fw, %arg0, %arg1) + // Lifted: + // call(f=@sc_fw, %a, %b) + // %x = tf.while(%a, %b) + // ... 
+ func::FuncOp orig_parent_func = + orig_non_tpu_caller->getParentOfType(); + IRMapping ir_map; + ir_map.map(orig_parent_func.getArguments(), operands); + Operation* new_caller = builder.clone(*orig_non_tpu_caller, ir_map); + return new_caller; +} + void EmbeddingPipeliningPass::runOnOperation() { + VLOG(3) << "EmbeddingPipeliningPass::runOnOperation()"; ModuleOp module = getOperation(); + SymbolTable symbol_table(module); llvm::SetVector forward_pass_ops; llvm::SetVector backward_pass_ops; @@ -793,6 +1652,7 @@ void EmbeddingPipeliningPass::runOnOperation() { // If there are no forward pass ops, there is no SC, so we end early. if (forward_pass_ops.empty()) { if (backward_pass_ops.empty()) { + VLOG(1) << "no pipelining ops found"; return; } else { (*backward_pass_ops.begin())->emitOpError() @@ -804,9 +1664,9 @@ void EmbeddingPipeliningPass::runOnOperation() { // Ensure that all ops are in the same region, and have the same replication // info. // TODO(bfontain): Allow for multiple regions/loops in one module. - // TODO(patn): move this pass after cluster formation to remove the complexity - // with replication info and metadata, cluster checking and generalizing to - // multiple TPU clusters. + // TODO(patn): move this pass after cluster formation to remove the + // complexity with replication info and metadata, cluster checking and + // generalizing to multiple TPU clusters. Region* region = (*forward_pass_ops.begin())->getParentRegion(); StringAttr replication_attr = GetReplicationAttr(*forward_pass_ops.begin()); llvm::SmallVector checkset(forward_pass_ops.getArrayRef()); @@ -826,7 +1686,7 @@ void EmbeddingPipeliningPass::runOnOperation() { // TODO(bfontain): Check that the region here is the region // of the loop body func. // Find the FuncOp for the surrounding while loop body. 
- func::FuncOp loop_body_func = + auto loop_body_func = (*forward_pass_ops.begin())->getParentOfType(); // merged_set will keep track of which ops are to be avoided when gather ops @@ -846,12 +1706,21 @@ void EmbeddingPipeliningPass::runOnOperation() { loop_body_func, replication_attr, merged_set, compilation_op); if (failed(result)) return signalPassFailure(); - TF::WhileOp while_op = nullptr; - result = FindOwningWhileOp(loop_body_func, module, &while_op); + TF::WhileOp orig_while_op = nullptr; + result = FindOwningWhileOp(loop_body_func, module, orig_while_op); if (failed(result)) return signalPassFailure(); + Location loc = orig_while_op->getLoc(); OpBuilder builder(module); + // A special fix for models that pass resources into helper functions and + // return the same resource (after passing it through multiple identity ops). + // Some subsequent ops use the original resource and others use the returned + // version. Pipelining splits these uses across loop iterations resulting in + // terrible things. + result = EliminateResourceLoops(builder, symbol_table, loop_body_func); + if (failed(result)) return signalPassFailure(); + result = FindForwardPassOps(builder, forward_pass_ops, backward_pass_ops, merged_set, loop_body_func, num_replicas); if (failed(result)) return signalPassFailure(); @@ -873,45 +1742,440 @@ void EmbeddingPipeliningPass::runOnOperation() { if (failed(result)) return signalPassFailure(); merged_set.insert(non_tpu_ops.begin(), non_tpu_ops.end()); - VLOG(2) << "Forwards pass " << forward_pass_ops.size() + VLOG(3) << "Forwards pass " << forward_pass_ops.size() << " ops, backwards pass " << backward_pass_ops.size() << " ops, core " << core_tpu_ops.size() << " ops. 
Total = " << merged_set.size() << " of " - << GetNumOps(loop_body_func) << ".\n"; + << GetNumOps(loop_body_func); builder.setInsertionPointAfter(*non_tpu_ops.begin()); - Operation* non_tpu_caller = nullptr; + TF::StatefulPartitionedCallOp non_tpu_caller = nullptr; result = - ExtractOpsAsFunc(builder, module, non_tpu_ops, replication_attr, nullptr, - nullptr, loop_body_func, "non_tpu", &non_tpu_caller); + ExtractOpsAsFunc(builder, module, symbol_table, non_tpu_ops, + replication_attr, nullptr, nullptr, loop_body_func, + "non_tpu", &non_tpu_caller, /*flag_for_inlining=*/false); if (failed(result)) return signalPassFailure(); builder.setInsertionPointAfter(non_tpu_caller); - Operation* forward_caller = nullptr; - result = ExtractOpsAsFunc(builder, module, forward_pass_ops, replication_attr, - metadata_op, compilation_op, loop_body_func, - "sc_forward", &forward_caller); + TF::StatefulPartitionedCallOp forward_caller = nullptr; + result = ExtractOpsAsFunc(builder, module, symbol_table, forward_pass_ops, + replication_attr, metadata_op, compilation_op, + loop_body_func, "sc_forward", &forward_caller, + /*flag_for_inlining=*/true); if (failed(result)) return signalPassFailure(); // Create tpu_core function builder.setInsertionPointAfter(forward_caller); - Operation* core_tpu_caller = nullptr; - result = ExtractOpsAsFunc(builder, module, core_tpu_ops, replication_attr, - metadata_op, compilation_op, loop_body_func, - "core_tpu", &core_tpu_caller); + TF::StatefulPartitionedCallOp core_tpu_caller = nullptr; + result = ExtractOpsAsFunc(builder, module, symbol_table, core_tpu_ops, + replication_attr, metadata_op, compilation_op, + loop_body_func, "core_tpu", &core_tpu_caller, + /*flag_for_inlining=*/true); if (failed(result)) return signalPassFailure(); builder.setInsertionPointAfter(core_tpu_caller); - Operation* backwards_pass_caller = nullptr; - result = ExtractOpsAsFunc( - builder, module, backward_pass_ops, replication_attr, metadata_op, - compilation_op, loop_body_func, 
"sc_backward", &backwards_pass_caller); + TF::StatefulPartitionedCallOp backward_caller = nullptr; + result = ExtractOpsAsFunc(builder, module, symbol_table, backward_pass_ops, + replication_attr, metadata_op, compilation_op, + loop_body_func, "sc_backward", &backward_caller, + /*flag_for_inlining=*/true); if (failed(result)) return signalPassFailure(); - metadata_op->erase(); - compilation_op->erase(); -} + Callers orig_callers; + orig_callers.forward = forward_caller; + orig_callers.backward = backward_caller; + orig_callers.core_tpu = core_tpu_caller; + orig_callers.non_tpu = non_tpu_caller; + // The output of the original while op also serves as subsequent input to + // the same function so input_signature == output_signature. Figure out the + // mapping from the result of each of the four functions into the result + // vector. + auto orig_return_op = *loop_body_func.getOps().begin(); + std::map loop_arg_update_map_non_tpu; + std::map loop_arg_update_map_core_tpu; + for (int ret_pos = 0; ret_pos < orig_return_op->getNumOperands(); ++ret_pos) { + auto operand = orig_return_op->getOperand(ret_pos); + auto def_op = operand.getDefiningOp(); + auto result = operand.dyn_cast(); + if (def_op == non_tpu_caller) { + loop_arg_update_map_non_tpu[result.getResultNumber()] = ret_pos; + } else if (def_op == core_tpu_caller) { + loop_arg_update_map_core_tpu[result.getResultNumber()] = ret_pos; + } else if (def_op == forward_caller) { + loop_body_func->emitOpError( + "Unexpected loop carried variable dependency on sc_forward"); + return signalPassFailure(); + } else if (def_op == backward_caller) { + loop_body_func->emitOpError( + "Unexpected loop carried variable dependency on sc_"); + return signalPassFailure(); + } else if (llvm::isa(operand)) { + // pass + } else { + // This should never happen. 
+ loop_body_func->emitOpError("Couldn't find mapping for return value "); + return signalPassFailure(); + } + } + + const int num_f_res = forward_caller->getNumResults(); + const int num_t_res = core_tpu_caller->getNumResults(); + + // At this point, we have separated the main while body ops into four + // functions: + // 1. SC forward pass ("forward_ops") + // 2. TC forward/backward pass ("core_tput_ops") + // 3. SC backward pass ("backward_ops") + // 4. Loop counter updates ("non_tpu_ops") + // + // Next, extract the original conditional function which we'll use to + // kick off the pre-loop pipelining steps. + // are just the operands passed to the original WhileOp. + func::FuncOp orig_cond_func = orig_while_op.cond_function(); + + std::vector loop_operands_0; + const int num_orig_loop_operands = orig_while_op->getNumOperands(); + loop_operands_0.reserve(num_orig_loop_operands); + Append(loop_operands_0, orig_while_op->getOperands()); + + // Evaluate the real conditional function before the new while loop. + builder.setInsertionPoint(orig_while_op); + Operation* cond_caller_0 = + MakeFuncCaller(builder, orig_while_op->getLoc(), orig_cond_func, + loop_operands_0, /*flag_for_inlining=*/false); + Value C_0 = cond_caller_0->getResults().front(); + + // Call the non_tpu function to update the loop counters. This is still + // part of the i=0 loop iteration. + builder.setInsertionPointAfter(cond_caller_0); + Operation* non_tpu_caller_0 = + LiftNonTpuFuncCaller(builder, non_tpu_caller, loop_operands_0); + // Save the results for later reference. + auto non_tpu_res_0 = ResultsAsVector(non_tpu_caller_0); + + // Start step 0. + // Now make the sc_fw + tc_fb call in the pre-loop. We assume (and assert) + // that we'll execute at least two steps. 
+ builder.setInsertionPointAfter(non_tpu_caller_0); + TF::StatefulPartitionedCallOp start_step_0; + result = StartStep0(builder, loc, symbol_table, metadata_op, compilation_op, + C_0, orig_callers, loop_operands_0, start_step_0); + if (failed(result)) return signalPassFailure(); + + // Save the results of the forward_0 and core_tpu_0 calls by slicing them + // out of the results. + auto forward_res_0 = ResultsAsVector(start_step_0, 0, num_f_res); + auto core_tpu_res_0 = ResultsAsVector(start_step_0, num_f_res, num_t_res); + + // Update the loop operands with results of non_tpu() and core_tpu(). + std::vector loop_operands_1 = loop_operands_0; + for (auto p : loop_arg_update_map_non_tpu) + loop_operands_1[p.second] = non_tpu_res_0[p.first]; + for (auto p : loop_arg_update_map_core_tpu) + loop_operands_1[p.second] = core_tpu_res_0[p.first]; + + // The second conditional evaluation. + builder.setInsertionPointAfter(start_step_0); + Operation* cond_caller_1 = + MakeFuncCaller(builder, orig_while_op->getLoc(), orig_cond_func, + loop_operands_1, /*flag_for_inlining=*/false); + Value C_1 = cond_caller_1->getResults().front(); + + builder.setInsertionPointAfter(cond_caller_1); + Operation* non_tpu_caller_1 = + LiftNonTpuFuncCaller(builder, non_tpu_caller, loop_operands_1); + auto non_tpu_res_1 = ResultsAsVector(non_tpu_caller_1); + + // Start step 1. Again, assume. + builder.setInsertionPointAfter(non_tpu_caller_1); + TF::StatefulPartitionedCallOp start_step_1; + result = StartStep1(builder, loc, symbol_table, metadata_op, compilation_op, + C_1, orig_callers, loop_operands_1, start_step_1); + if (failed(result)) return signalPassFailure(); + + // Save the results of the forward_1 call. + auto forward_res_1 = ResultsAsVector(start_step_1); + + // Update the loop operands with any outputs from the non_tpu and core_tpu + // functions. Note, core_tpu isn't called again until the middle of the loop + // body. So, loop_operands_2 is only partially updated here. 
We'll finish + // updating this after core_tpu() is called in the new while body. + std::vector loop_operands_2 = loop_operands_1; + for (auto p : loop_arg_update_map_non_tpu) + loop_operands_2[p.second] = non_tpu_res_1[p.first]; + + // The second conditional evaluation. The assumption here is that the + // partially updated loop_operands_2 is sufficient for correct evaluation of + // the cond() function. + builder.setInsertionPointAfter(start_step_1); + Operation* cond_caller_2 = + MakeFuncCaller(builder, orig_while_op->getLoc(), orig_cond_func, + loop_operands_2, /*flag_for_inlining=*/false); + Value C_2 = cond_caller_2->getResults().front(); + + // The new while body: + // + // First, we need to construct the body and conditional functions. To do so, + // we need to create the initial operand list that we'll need. This will + // determine the type signature for the body and cond functions. + std::vector tmp_while_operands; + Append(tmp_while_operands, loop_operands_0); + Append(tmp_while_operands, loop_operands_1); + Append(tmp_while_operands, loop_operands_2); + Append(tmp_while_operands, forward_res_0); + Append(tmp_while_operands, forward_res_1); + Append(tmp_while_operands, core_tpu_res_0); + Append(tmp_while_operands, non_tpu_res_1); + Append(tmp_while_operands, {C_0, C_1, C_2}); + + // Dedupe the operands. We'll need a map to help translate. + llvm::SetVector new_while_operands; + llvm::MapVector loop_var_map; + for (auto operand : tmp_while_operands) { + if (new_while_operands.insert(operand)) { + // First time seeing this operand. Let's record the final resting place + // in the new_while_operands vector. + loop_var_map[operand] = new_while_operands.size() - 1; + } + } + // Save index mappings for canonical vectors. 
+  auto BuildUnpackIndexes =
+      [&loop_var_map](std::vector<Value>& prototype_vals) {
+        std::vector<int> indexes;
+        indexes.reserve(prototype_vals.size());
+        for (auto prototype_val : prototype_vals)
+          indexes.push_back(loop_var_map[prototype_val]);
+        return indexes;
+      };
+  auto loop_operands_indexes_im2 = BuildUnpackIndexes(loop_operands_0);
+  auto loop_operands_indexes_im1 = BuildUnpackIndexes(loop_operands_1);
+  auto loop_operands_indexes_i = BuildUnpackIndexes(loop_operands_2);
+  auto forward_res_indexes_im2 = BuildUnpackIndexes(forward_res_0);
+  auto forward_res_indexes_im1 = BuildUnpackIndexes(forward_res_1);
+  auto core_tpu_res_indexes_im2 = BuildUnpackIndexes(core_tpu_res_0);
+  auto non_tpu_res_indexes_im1 = BuildUnpackIndexes(non_tpu_res_1);
+  int C_index_im2 = loop_var_map[C_0];
+  int C_index_im1 = loop_var_map[C_1];
+  int C_index_i = loop_var_map[C_2];
+
+  // Get the operand types.
+  std::vector<Type> new_while_operand_types = GetValueTypes(new_while_operands);
+
+  // Make cond and body functions for the new while op.
+  // Create the function based on input and result types and values.
+  // Note, for a while loop body function, the operand types and result types
+  // are identical.
+ auto body_func_type = mlir::FunctionType::get( + &getContext(), new_while_operand_types, new_while_operand_types); + auto cond_func_type = mlir::FunctionType::get( + &getContext(), new_while_operand_types, orig_cond_func.getResultTypes()); + func::FuncOp cond = + func::FuncOp::create(loc, "new_while_cond", cond_func_type); + func::FuncOp body = + func::FuncOp::create(loc, "new_while_body", body_func_type); + cond.setPrivate(); + body.setPrivate(); + symbol_table.insert(cond); + symbol_table.insert(body); + OpBuilder cond_builder = OpBuilder::atBlockBegin(cond.addEntryBlock()); + OpBuilder body_builder = OpBuilder::atBlockBegin(body.addEntryBlock()); + + //**************************************************************************** + // Build the internals of the new tf.While op's conditional function. + //**************************************************************************** + // Build the cond function body. All we need is a ReturnOp that returns C_i + // which is the last argument. + cond_builder.create(loc, cond.getArgument(C_index_i)); + + //**************************************************************************** + // Build the internals of the new tf.While op's body function. + //**************************************************************************** + auto body_args = body.getArguments(); + // First, let's unpack all the body arguments. + auto UnpackArgs = [&body_args](std::vector& indexes) { + // This helper makes it easy to unpack "natural" vectors of values while + // still respecting the impact of deduping. 
+    std::vector<Value> slice;
+    int num = indexes.size();
+    slice.reserve(num);
+    for (auto i : indexes) slice.push_back(body_args[i]);
+    return slice;
+  };
+  auto loop_operands_im2 = UnpackArgs(loop_operands_indexes_im2);
+  auto loop_operands_im1 = UnpackArgs(loop_operands_indexes_im1);
+  auto loop_operands_i = UnpackArgs(loop_operands_indexes_i);
+  auto forward_res_im2 = UnpackArgs(forward_res_indexes_im2);
+  auto forward_res_im1 = UnpackArgs(forward_res_indexes_im1);
+  auto core_tpu_res_im2 = UnpackArgs(core_tpu_res_indexes_im2);
+  auto non_tpu_res_im1 = UnpackArgs(non_tpu_res_indexes_im1);
+  auto C_im1 = body_args[C_index_im1];
+  auto C_i = body_args[C_index_i];
+
+  // Now, construct the operand list for each op by unpacking values.
+
+  //
+  // Finish step i-2
+  //
+  // First, add all the inputs to sc_backward(). These all come from the block
+  // arguments, sc_forward() and core_tpu() and need to be pulled from the
+  // "i-2" (or "0") version of the inputs.
+  std::vector<Value> b_operands;
+  result = MakeBackwardOperands(forward_caller, core_tpu_caller,
+                                backward_caller, loop_operands_im2,
+                                forward_res_im2, core_tpu_res_im2, b_operands);
+  if (failed(result)) return signalPassFailure();
+  auto backward_caller_im2 = body_builder.clone(*backward_caller);
+  backward_caller_im2->setOperands(b_operands);
+
+  //
+  // Finish step i-1
+  //
+  // Second, add all the inputs to core_tpu(). These all come from the while
+  // loop operands, sc_forward() or non_tpu() and need to be pulled from the
+  // "i-1" (or "1") version of the inputs.
+ std::vector t_operands; + result = MakeCoreTPUOperands(core_tpu_caller, non_tpu_caller, forward_caller, + loop_operands_im1, non_tpu_res_im1, + forward_res_im1, t_operands); + if (failed(result)) return signalPassFailure(); + auto core_tpu_caller_im1 = body_builder.clone(*core_tpu_caller); + core_tpu_caller_im1->setOperands(t_operands); + auto core_tpu_res_im1 = ResultsAsVector(core_tpu_caller_im1); + + // Update the loop operands with results of core_tpu(). + for (auto p : loop_arg_update_map_core_tpu) + loop_operands_i[p.second] = core_tpu_res_im1[p.first]; + + // + // Start step i + // + // Third, add all the inputs to non_tpu(). These all come from the while + // loop operands and need to be pulled from the "i" (or "2") version of the + // inputs. + std::vector n_operands; + result = MakeNonTPUOperands(non_tpu_caller, loop_operands_i, n_operands); + if (failed(result)) return signalPassFailure(); + auto non_tpu_caller_i = body_builder.clone(*non_tpu_caller); + non_tpu_caller_i->setOperands(n_operands); + auto non_tpu_res_i = ResultsAsVector(non_tpu_caller_i); + + // Fourth, add all the inputs to sc_forward(). These all come from the + // while loop operands or the non_tpu() call that's in the loop body. The + // loop operands need to be pulled from the "i" (or "2") version of the + // inputs. The inputs coming from non_tpu() are from the same loop iteration + // (non_tpu_res_i). + std::vector f_operands; + result = MakeForwardOperands(forward_caller, non_tpu_caller, loop_operands_i, + non_tpu_res_i, f_operands); + if (failed(result)) return signalPassFailure(); + auto forward_caller_i = body_builder.clone(*forward_caller); + forward_caller_i->setOperands(f_operands); + auto forward_res_i = ResultsAsVector(forward_caller_i); + + // Update the loop operands with results of non_tpu(). Results for + // core_tpu() are lagged. 
+ std::vector loop_operands_ip1 = loop_operands_i; + for (auto p : loop_arg_update_map_non_tpu) + loop_operands_ip1[p.second] = non_tpu_res_i[p.first]; + + // Add the conditional evaluation for the next loop iteration. + Operation* cond_caller_ip1 = + MakeFuncCaller(body_builder, orig_while_op->getLoc(), orig_cond_func, + loop_operands_ip1, /*flag_for_inlining=*/false); + Value C_ip1 = cond_caller_ip1->getResults().front(); + + // Build the ReturnOp. This mirrors the construction of the operands with + // 'i' values incremented. + std::vector tmp_body_results; + Append(tmp_body_results, loop_operands_im1); + Append(tmp_body_results, loop_operands_i); + Append(tmp_body_results, loop_operands_ip1); + Append(tmp_body_results, forward_res_im1); + Append(tmp_body_results, forward_res_i); + Append(tmp_body_results, core_tpu_res_im1); + Append(tmp_body_results, non_tpu_res_i); + Append(tmp_body_results, {C_im1, C_i, C_ip1}); + + llvm::SetVector new_body_results; + // This should pack the same as deduping code above. + new_body_results.insert(tmp_body_results.begin(), tmp_body_results.end()); + auto new_body_return_types = GetValueTypes(new_body_results); + + body_builder.setInsertionPointAfter(cond_caller_ip1); + body_builder.create(orig_while_op->getLoc(), + new_body_results.getArrayRef()); + + // Finally, create the new tf.WhileOp. + builder.setInsertionPoint(orig_while_op); + auto new_while_op = builder.create( + orig_while_op->getLoc(), new_body_return_types, + new_while_operands.getArrayRef(), cond.getSymName(), body.getSymName(), + /*parallel_iterations=*/10, + /*is_stateless=*/false, + /*shape_invariant=*/false); + SetBasicBlockAttributes(builder, new_while_op); + + // First, let's unpack all the body arguments. 
+ auto UnpackResults = [&new_while_op](std::vector& indexes) { + int num = indexes.size(); + std::vector slice; + slice.reserve(num); + for (auto i : indexes) slice.push_back(new_while_op->getResult(i)); + return slice; + }; + auto loop_operands_nm2 = UnpackResults(loop_operands_indexes_im2); + auto loop_operands_nm1 = UnpackResults(loop_operands_indexes_im1); + auto loop_operands_n = UnpackResults(loop_operands_indexes_i); + auto forward_res_nm2 = UnpackResults(forward_res_indexes_im2); + auto forward_res_nm1 = UnpackResults(forward_res_indexes_im1); + auto core_tpu_res_nm2 = UnpackResults(core_tpu_res_indexes_im2); + auto non_tpu_res_nm1 = UnpackResults(non_tpu_res_indexes_im1); + auto C_nm2 = new_while_op->getResult(C_index_im2); + auto C_nm1 = new_while_op->getResult(C_index_im1); + + // Finish step n-2. + builder.setInsertionPointAfter(new_while_op); + TF::StatefulPartitionedCallOp finish_step_nm2; + result = FinishStepNm2(builder, loc, symbol_table, metadata_op, + compilation_op, C_nm2, orig_callers, loop_operands_nm2, + forward_res_nm2, core_tpu_res_nm2, finish_step_nm2); + if (failed(result)) return signalPassFailure(); + + // Finish step n-1. + builder.setInsertionPointAfter(finish_step_nm2); + TF::StatefulPartitionedCallOp finish_step_nm1; + result = FinishStepNm1(builder, loc, symbol_table, metadata_op, + compilation_op, C_nm1, orig_callers, loop_operands_nm1, + forward_res_nm1, finish_step_nm1); + if (failed(result)) return signalPassFailure(); + + // Save the results of the core_tpu_0 call and use it to finalize the + // loop_operands_n array. + auto core_tpu_res_nm1 = ResultsAsVector(finish_step_nm1, 0, num_t_res); + for (auto p : loop_arg_update_map_core_tpu) + loop_operands_n[p.second] = core_tpu_res_nm1[p.first]; + + // Replace the return values from the original WhileOp with the output of + // the pipelining. 
+ for (auto p : llvm::zip(orig_while_op->getResults(), loop_operands_n)) + replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), + *orig_while_op->getParentRegion()); + + // Inline the new while body. + result = Inliner(builder, symbol_table).InlineCallsInFunc(body, false); + if (failed(result)) return signalPassFailure(); + + // Erase original while op and temporary functions. Note, we use the non_tpu + // function in the output graph. + symbol_table.lookup(orig_callers.forward.getF())->erase(); + symbol_table.lookup(orig_callers.core_tpu.getF())->erase(); + symbol_table.lookup(orig_callers.backward.getF())->erase(); + orig_while_op.body_function().erase(); + orig_while_op.erase(); + + VLOG(3) << "EmbeddingPipeliningPass::runOnOperation done."; +} } // namespace std::unique_ptr> CreateEmbeddingPipeliningPass() { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/embedding_sequencing.cc b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_sequencing.cc new file mode 100644 index 00000000000..a83f6ac54a8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/embedding_sequencing.cc @@ -0,0 +1,924 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +// This pass separates SparseCore, TensorCore, and non-TPU operations into +// separate functions for proper sequencing of TF2 TPU Embedding (see +// tpu_embedding_v3.py). This pass is a precursor for pipelining (see +// embedding_pipelining.cc) and DOES NOT permit parallel execution across SC and +// TC. This pass is a temporary fallback to use while developing full pipelining +// capabilities. +// +// Ops are broken up into: +// 1. SC forward pass +// 2. TC forward/backward pass +// 3. SC backward pass +// 4. non-TPU loop counter updates + +#include +#include +#include +#include +#include + +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" + +#define GEN_PASS_DEF_EMBEDDINGSEQUENCINGPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" + 
+static constexpr char kEmbeddingPipelining[] = "_embedding_pipelining";
+static constexpr char kEmbeddingForward[] = "forward";
+static constexpr char kEmbeddingBackward[] = "backward";
+static constexpr char kDevice[] = "device";
+static constexpr llvm::StringRef kTpuCompilationStatus =
+    "_tpu_compilation_status";
+
+namespace mlir {
+namespace TFDevice {
+namespace {
+
+struct EmbeddingSequencingPass
+    : public ::impl::EmbeddingSequencingPassBase<EmbeddingSequencingPass> {
+  void getDependentDialects(mlir::DialectRegistry& registry) const override {
+    registry.insert();
+  }
+
+  void runOnOperation() override;
+};
+
+template <typename InputContainer>
+std::vector<Type> GetValueTypes(const InputContainer& input) {
+  // Convert a list of mlir::Value's into a list of mlir::Type's
+  std::vector<Type> types;
+  types.reserve(input.size());
+  for (auto val : input) types.push_back(val.getType());
+  return types;
+}
+
+bool IsResourceType(Type val_type) {
+  if (auto tensor_type = val_type.dyn_cast<TensorType>()) {
+    if (tensor_type.getElementType().isa<TF::ResourceType>()) {
+      return true;
+    }
+  }
+  return false;
+}
+
+bool IsTPUOp(mlir::Operation* op) {
+  return op->hasAttr(TF::kReplicationInfoAttr);
+}
+
+StringAttr GetReplicationAttr(mlir::Operation* op) {
+  return op->getAttrOfType<StringAttr>(TF::kReplicationInfoAttr);
+}
+
+StringAttr GetReplicationAttr(TF::TPUCompilationResultOp op) {
+  // Special case for getting the replication region for
+  // TPUCompilationResultsOp.
+  return op->getAttrOfType<StringAttr>(kTpuCompilationStatus);
+}
+
+int64_t GetNumOps(func::FuncOp func) {
+  int64_t num_ops = 0;
+  for (auto it = func.begin(); it != func.end(); ++it) ++num_ops;
+  return num_ops;
+}
+
+void GatherOpsForExtraction(mlir::SetVector<Operation*>* operations,
+                            const mlir::SetVector<Operation*>& ops_to_avoid,
+                            bool predecessors, bool successors) {
+  // Walk the input and output dependencies of the Ops in `operations` to form
+  // the closure of Ops needed to evaluate 'operations'. Input dependencies are
+  // walked if 'predecessors' is true and output dependencies are walked if
+  // 'successors' is true.
In either case, if a discoverd Op is in the + // 'ops_to_avoid' set, then the dependency walking is terminated. + llvm::SetVector ops_to_process(*operations); + llvm::SetVector new_ops; + + while (!ops_to_process.empty()) { + for (Operation* op : ops_to_process) { + if (predecessors) { + for (Value operand : op->getOperands()) { + // Stop at the block boundary. + if (operand.isa()) continue; + + Operation* predecessor = operand.getDefiningOp(); + if (!operations->contains(predecessor) && + !ops_to_avoid.contains(predecessor)) { + new_ops.insert(operand.getDefiningOp()); + operations->insert(operand.getDefiningOp()); + } + } + } + if (successors) { + for (mlir::Operation* successor : op->getUsers()) { + // Don't include the return op + if (llvm::isa(successor)) continue; + + if (!operations->contains(successor) && + !ops_to_avoid.contains(successor)) { + new_ops.insert(successor); + operations->insert(successor); + } + } + } + } + ops_to_process.swap(new_ops); + new_ops.clear(); + } +} + +TF::StatefulPartitionedCallOp MakeFuncCaller( + mlir::OpBuilder& builder, const Location& loc, func::FuncOp func, + const llvm::SetVector& operands) { + // Constructs a tf.StatefulPartitionedCall to the function provided in 'func' + // using the operands in 'operands'. Assumes the insertion point on builder is + // already set. + auto symbol = + mlir::SymbolRefAttr::get(builder.getContext(), func.getSymName()); + auto result_types = func.getResultTypes(); + auto caller = builder.create( + loc, result_types, operands.getArrayRef(), symbol, + /*config=*/builder.getStringAttr(""), + /*config_proto=*/builder.getStringAttr(""), + /*executor_type=*/builder.getStringAttr("")); + caller.setFAttr(symbol); + return caller; +} + +func::FuncOp CreateFnWithSignature(ModuleOp module, + const llvm::SetVector& inputs, + const llvm::SetVector& outputs, + const std::string& name) { + // Creates an empty func.FuncOp with a signature compatible with 'inputs' + // (operands) and 'outputs' (results). 
+ OpBuilder builder(module); + + std::vector input_types = GetValueTypes(inputs); + std::vector output_types = GetValueTypes(outputs); + builder.setInsertionPointToEnd(&module.getBodyRegion().back()); + func::FuncOp func_op = builder.create( + module.getLoc(), name, + builder.getFunctionType(input_types, output_types)); + func_op.setPrivate(); + + return func_op; +} + +TF::StatefulPartitionedCallOp EncapsulateOpsInFunc( + OpBuilder& builder, const llvm::SetVector& ops, + const llvm::SetVector& inputs, const llvm::SetVector& outputs, + func::FuncOp parent_func, ModuleOp module, const std::string& name) { + // Moves all of the Operations in 'ops' into a newly created func.FuncOp + // function named 'name' and replaces the original ops with a call to the + // newly created function using a tf.StatefulPartitionedCall. Here, + // 'parent_func' is the function that holds the original set of ops. + // Note, 'inputs' and 'outputs' are the predetermined set of values that + // should become the operands and return values, respectively. + auto insertion_point = builder.saveInsertionPoint(); + func::FuncOp new_func = CreateFnWithSignature(module, inputs, outputs, + absl::StrCat("_func_", name)); + + // This preserves the order of the ops that was in the original parent + // funtion. This is critical for preserving correctness in the presence of + // resource variables and stateful functions. + std::vector topological_order; + for (Operation& op : parent_func.getOps()) + if (ops.contains(&op)) topological_order.push_back(&op); + + // Create the partitioned call + builder.restoreInsertionPoint(insertion_point); + auto caller = MakeFuncCaller(builder, module.getLoc(), new_func, inputs); + + Block* block = new_func.addEntryBlock(); + + for (Operation* op : topological_order) op->moveBefore(block, block->end()); + + // Replace the 'inputs' values with the new function's arguments. 
+ for (auto p : llvm::zip(inputs, new_func.getArguments())) + replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), + new_func.getBody()); + + builder.setInsertionPointToEnd(block); + builder.create(parent_func.getLoc(), outputs.getArrayRef()); + + // Replace the original 'outputs' values with the result of the call to the + // new function. + for (auto p : llvm::zip(outputs, caller->getResults())) + replaceAllUsesInRegionWith(std::get<0>(p), std::get<1>(p), + parent_func.getBody()); + + return caller; +} + +void UpdateAndInsertTPUOps(TF::StatefulPartitionedCallOp caller, + TF::TPUReplicateMetadataOp metadata_op, + TF::TPUCompilationResultOp compilation_op, + StringAttr old_group) { + // Adds the TPUReplicateMetatdataOp and TPUCompilationResultOp ops to the + // function called by the provided 'caller'. + mlir::CallInterfaceCallable callable = caller.getCallableForCallee(); + mlir::SymbolRefAttr sym = callable.dyn_cast(); + auto func = llvm::dyn_cast( + mlir::SymbolTable::lookupNearestSymbolFrom(caller, sym)); + OpBuilder builder(func.getBody()); + + StringAttr new_group = builder.getStringAttr( + absl::StrCat(old_group.getValue().str(), caller.getF().str())); + + builder.insert(metadata_op.clone()); + for (Operation& op : func.getOps()) { + if (!IsTPUOp(&op)) continue; + op.setAttr(TF::kReplicationInfoAttr, new_group); + } + TF::TPUCompilationResultOp new_result = compilation_op.clone(); + new_result->setAttr(kTpuCompilationStatus, new_group); + builder.insert(new_result); +} + +template +LogicalResult FindAndExcludeOp(func::FuncOp func, + const StringAttr& replication_attr, + llvm::SetVector& merged_set, + OpType& found_op) { + // Find the TPUReplicationMetadata or TPUCompilationResult ops which will be + // cloned/inserted into each region. We add them to the merged_set so that + // they're ignored when extracting the four main functions. 
+ found_op = nullptr; + for (OpType op : func.getOps()) { + if (found_op != nullptr) { + func.emitOpError() << "number of " << found_op.getOperationName() + << " in loop body is not 1"; + return LogicalResult::failure(); + } + if (GetReplicationAttr(op) != replication_attr) { + op.emitOpError() << "is not part of the replication region " + << replication_attr << " vs " << GetReplicationAttr(op); + return LogicalResult::failure(); + } + found_op = op; + merged_set.insert(found_op); + } + return LogicalResult::success(); +} + +LogicalResult FindOwningWhileOp(func::FuncOp body_func, ModuleOp module, + TF::WhileOp* while_op) { + // Given a while loop body function 'body_func', find the tf.While Op that + // uses it. + auto uses_optional = body_func.getSymbolUses(module); + if (!uses_optional.has_value()) { + body_func.emitOpError() << "no use of while loop body"; + return LogicalResult::failure(); + } + *while_op = nullptr; + for (auto& use : uses_optional.value()) { + if (llvm::isa(use.getUser())) { + if (*while_op != nullptr) { + use.getUser()->emitOpError() << "multiple users of function."; + return LogicalResult::failure(); + } else { + *while_op = llvm::cast(use.getUser()); + } + } else { + use.getUser()->emitOpError() << "non while use of function."; + return LogicalResult::failure(); + } + } + // TODO(bfontain): If the while op is not present we could just split things + // or we wait until the compiler supports multiple regions? + if (while_op == nullptr) { + body_func.emitOpError() << "unable to find while body user."; + return LogicalResult::failure(); + } + return LogicalResult::success(); +} + +LogicalResult FindForwardPassOps(OpBuilder& builder, + llvm::SetVector& forward_pass_ops, + llvm::SetVector& backward_pass_ops, + llvm::SetVector& merged_set, + func::FuncOp loop_body_func, + const int num_replicas) { + // Find all the ops that are to be included in the 'sc_forward' function which + // will be executed on the SparseCore. 
Note, 'forward_pass_ops' is initially + // seeded with ops from the input MLIR graph that have the + // _embedding_pipelining="forward" attribute which is set by the TF2 Embedding + // API. + // + // When outputs of the forward pass function are used outside of it, we'll + // need to insert a TPUReplicatedOutput Op and include that in the + // forward_pass_ops. And if that usage is also on the TPU (either TensorCore + // or SparseCore) we'll need to insert a matching TPUReplicatedInput. We do + // this before the Ops are removed from the original function/graph so that + // function operands and return values are handled automatically. + + // First, walk the op dependencies. + GatherOpsForExtraction(&forward_pass_ops, merged_set, /*predecessors=*/true, + /*successors=*/false); + + // Locate which variable inputs are part of the forwards pass. These will + // also be used in the backwards pass. We need to create a 'private' copy + // of the TpuReplicatedInput for for the fowards pass if there are users + // outside the pass. Note that in the case of the backwards pass existing + // this will be the case. + // This means that when we have put all out sections together some resource + // inputs will have multiple TPUReplicateInput nodes, so we will need a final + // pass to merge these together into the earliest copy. + llvm::SetVector forward_variable_inputs; + + // Validate that the only resource inputs that are read by ops in + // forward_pass_ops are dataset and variable ops. + int64_t resource_count = 0; + for (auto argument : loop_body_func.getArguments()) { + // Check that all resource arguments are either fed to iterator get next + // or a TPUReplicatedInput with is_packed. 
+ + if (IsResourceType(argument.getType())) { + resource_count++; + bool is_variable = false; + bool is_non_variable = false; + bool use_in_forward = false; + bool use_in_not_forward = false; + for (auto user : argument.getUsers()) { + if (llvm::isa(user)) continue; + if (!forward_pass_ops.contains(user)) { + use_in_not_forward = true; + } else { + use_in_forward = true; + } + if (TF::TPUReplicatedInputOp input = + llvm::dyn_cast(user)) { + if (!input.getIsPacked()) { + input.emitOpError() << "unexpected variable input, not packed"; + return LogicalResult::failure(); + } + + if (is_variable) { + input.emitOpError() << "unexpected multiple TPUReplicatedInputOp " + << "for single argument"; + return LogicalResult::failure(); + } + is_variable = true; + } else { + is_non_variable = true; + } + } + if (use_in_forward && use_in_not_forward) { + loop_body_func.emitOpError() + << "resource input " << argument.getArgNumber() + << " is used both in the forwards and " + << "not forward passes dataset"; + return LogicalResult::failure(); + } + if (is_non_variable && is_variable) { + loop_body_func.emitOpError() + << "resource input " << argument.getArgNumber() + << " is used both as a varible and not " + << " a variable"; + return LogicalResult::failure(); + } + if (is_variable && use_in_forward) + forward_variable_inputs.insert(argument.getArgNumber()); + } + } + + VLOG(3) << "Found " << forward_variable_inputs.size() + << " variables used in forward pass of " << resource_count + << " total resource inputs"; + + // Clone the TPUReplicatedInputs. + int64_t cloned_inputs = 0; + for (int64_t index : forward_variable_inputs) { + Value argument = loop_body_func.getArgument(index); + // Uses of this argument should only be the return and the + // TPUReplicateInputOp. This is checked by the loop above. 
+ Operation* input_ptr = nullptr; + for (Operation* user : argument.getUsers()) { + if (llvm::isa(user)) { + input_ptr = user; + break; + } + } + TF::TPUReplicatedInputOp input = + llvm::cast(input_ptr); + + // Validate that all users of the TPUReplicatedInput are ReadVariable + // or AssignVariable ops and check if any are outside the forwards pass. + bool duplicate_needed = false; + for (Operation* next_user : input.getOutput().getUsers()) { + if (!llvm::isa(next_user) && + !llvm::isa(next_user)) { + next_user->emitOpError() + << "unexpected user of output of TPUReplicatedInputOp"; + return LogicalResult::failure(); + } + if (!forward_pass_ops.contains(next_user)) duplicate_needed = true; + } + if (!duplicate_needed) continue; + + cloned_inputs++; + builder.setInsertionPointAfter(input); + forward_pass_ops.remove(input); + + TF::TPUReplicatedInputOp private_input = input.clone(); + builder.insert(private_input); + forward_pass_ops.insert(private_input); + for (OpOperand& next_use : input.getOutput().getUses()) { + if (!forward_pass_ops.contains(next_use.getOwner())) continue; + next_use.getOwner()->setOperand(next_use.getOperandNumber(), + private_input.getOutput()); + } + } + + VLOG(2) << "Cloned " << cloned_inputs << " TPUReplicatedInputOps"; + + // Add TPUReplicatedInput/TPUReplicatedOutput pairs along each edge. + llvm::SetVector new_forward_ops; + for (Operation* op : forward_pass_ops) { + // TODO(bfontain): Should validate that all the TPU ops are in the same + // replication region. + if (!IsTPUOp(op)) continue; + for (Value result : op->getResults()) { + std::vector> out_of_region_use; + for (OpOperand& use : result.getUses()) { + auto use_owner = use.getOwner(); + // TODO(bfontain): Error check here, if the use.getOwner() is not a TPU + // then this op must be a TPUReplicatedOutputOp. 
+ if (IsTPUOp(use_owner) && !forward_pass_ops.contains(use_owner)) + out_of_region_use.push_back( + std::make_pair(use_owner, use.getOperandNumber())); + } + if (out_of_region_use.empty()) continue; + builder.setInsertionPointAfter(op); + std::vector types(num_replicas, result.getType()); + TF::TPUReplicatedOutputOp replicated_output = + builder.create(op->getLoc(), + TypeRange(types), result); + new_forward_ops.insert(replicated_output); + // TODO(bfontain): Check for other attributes. + replicated_output->setAttr(kDevice, builder.getStringAttr("")); + TF::TPUReplicatedInputOp input = builder.create( + op->getLoc(), result.getType(), replicated_output.getResults()); + input->setAttr(kDevice, builder.getStringAttr("")); + mlir::Value new_value = input.getOutput(); + + if (mlir::isa( + result.getDefiningOp())) { + TF::TPUAnnotateTensorsWithDynamicShapeOp annotate_op = + builder.create( + op->getLoc(), result.getType(), new_value, + result.getDefiningOp()->getAttrs()); + for (auto [operation, index] : out_of_region_use) { + if (!backward_pass_ops.contains(operation)) { + operation->emitOpError() + << "expect all dynamic inputs consumed by backwards pass."; + return LogicalResult::failure(); + } + } + + backward_pass_ops.insert(annotate_op); + new_value = annotate_op->getResult(0); + } + for (auto [operation, index] : out_of_region_use) + operation->setOperand(index, new_value); + } + } + + VLOG(2) << "inserted " << new_forward_ops.size() << " TPU Input/Output ops"; + forward_pass_ops.insert(new_forward_ops.begin(), new_forward_ops.end()); + return LogicalResult::success(); +} + +LogicalResult FindBackwardPassOps( + OpBuilder& builder, llvm::SetVector& backward_pass_ops, + llvm::SetVector& merged_set, const int num_replicas) { + // Find all the ops that are to be included in the 'sc_backward' function + // which will be executed on the SparseCore. 
Note, 'backward_pass_ops' is + // initially seeded with ops from the input MLIR graph that have the + // _embedding_pipelining="backward" attribute which is set by the TF2 + // Embedding API. + // + // Since we're inserting a replication boundary around the backward pass + // function, we'll also need to make sure TPUReplicatedInputOp and + // TPUReplicatedOutputOp ops are inserted as necessary. + + // First, walk the Ops dependencies. + GatherOpsForExtraction(&backward_pass_ops, merged_set, /*predecessors=*/false, + /*successors=*/true); + + VLOG(3) << "found " << backward_pass_ops.size() << " backwards pass ops"; + + // If any inputs are to the backward_pass_ops region are direct + // TPUReplicatedInput ops, then include (if this is the only use) or + // clone the op. This will be the case for all Read/Assign variable ops. + + llvm::SetVector to_clone; + llvm::SetVector to_insert; + + for (Operation* op : backward_pass_ops) { + for (OpOperand& input_value : op->getOpOperands()) { + Operation* predecessor_op = input_value.get().getDefiningOp(); + if (TF::TPUReplicatedInputOp input = + llvm::dyn_cast(predecessor_op)) { + if (to_clone.contains(input) || to_insert.contains(input)) continue; + // Check if all uses in backwards pass. 
+ bool all_in_backwards = true; + for (Operation* user : input->getUsers()) + if (!backward_pass_ops.contains(user)) all_in_backwards = false; + if (all_in_backwards) + to_insert.insert(input); + else + to_clone.insert(input); + } + } + } + backward_pass_ops.insert(to_insert.begin(), to_insert.end()); + for (TF::TPUReplicatedInputOp input : to_clone) { + builder.setInsertionPointAfter(input); + TF::TPUReplicatedInputOp private_input = input.clone(); + builder.insert(private_input); + backward_pass_ops.insert(private_input); + for (OpOperand& next_use : input.getOutput().getUses()) { + if (!backward_pass_ops.contains(next_use.getOwner())) continue; + next_use.getOwner()->setOperand(next_use.getOperandNumber(), + private_input.getOutput()); + } + } + + VLOG(2) << " cloned " << to_clone.size() << " and inserted " + << to_insert.size() << " TPUReplicatedInput ops"; + + // For all other inputs that go from TPU op to TPU op, insert the + // TPUOutput/Input pair. + + // Add TPUReplicatedInput/TPUReplicatedOutput pairs along each edge. + // TODO(bfontain): Should be merged with the above loop. + llvm::SetVector values_to_add_nodes; + + for (Operation* op : backward_pass_ops) { + // TODO(bfontain): Should validate that all the TPU ops are in the same + // replication region. + // If the op is already a replicated input, no need to to anything. + if (!IsTPUOp(op) || llvm::isa(op)) continue; + for (OpOperand& input_value : op->getOpOperands()) + // TODO(bfontain): Error check here, this line should never be false, + // since we skip the TF::TPUReplicatedInputOp case. 
+ if (IsTPUOp(input_value.get().getDefiningOp()) && + !backward_pass_ops.contains(input_value.get().getDefiningOp())) + values_to_add_nodes.insert(input_value.get()); + } + + for (Value value : values_to_add_nodes) { + builder.setInsertionPointAfter(value.getDefiningOp()); + std::vector types(num_replicas, value.getType()); + Location loc = value.getDefiningOp()->getLoc(); + TF::TPUReplicatedOutputOp output = + builder.create(loc, TypeRange(types), value); + // TODO(bfontain): Check for other attributes. + output->setAttr(kDevice, builder.getStringAttr("")); + TF::TPUReplicatedInputOp input = builder.create( + loc, value.getType(), output.getResults()); + input->setAttr(kDevice, builder.getStringAttr("")); + for (OpOperand& use : value.getUses()) + if (backward_pass_ops.contains(use.getOwner())) + use.getOwner()->setOperand(use.getOperandNumber(), input.getOutput()); + backward_pass_ops.insert(input); + } + + VLOG(2) << " inserted " << values_to_add_nodes.size() + << " TPUReplicatedInput/Output pairs"; + return LogicalResult::success(); +} + +LogicalResult FindCoreTPUOps( + llvm::SetVector& core_tpu_ops, + const llvm::SetVector& forward_pass_ops, + const llvm::SetVector& backward_pass_ops, + const llvm::SetVector& merged_set, + func::FuncOp loop_body_func) { + // Find all of the Ops that are part of the forward/backward pass but aren't + // targeting the SparseCore. Note that we need to include some non-TPU ops + // that flow out of the forward pass function. Otherwise, they would get + // absorbed into the non_tpu function which breaks the pipelining + // decomposition strategy. + // + // Find all the outputs of the forward pass that aren't fed into the backward + // pass. 
+ for (Operation* op : forward_pass_ops) { + for (Value res : op->getResults()) { + for (auto user : res.getUsers()) { + if (!forward_pass_ops.contains(user) && + !backward_pass_ops.contains(user)) { + core_tpu_ops.insert(user); + } + } + } + } + + // Gather all TPU ops marked for compilation in this while loop body that also + // are not in one of the two other sets. + for (Operation& op : loop_body_func.getOps()) { + // Find all TPU ops that don't belong to the forward or backward pass. + if (merged_set.contains(&op) || llvm::isa(op) || + !IsTPUOp(&op) || op.hasAttr(kEmbeddingPipelining)) + continue; + // TODO(bfontain): only collect those ops in a fixed TPUReplica. + core_tpu_ops.insert(&op); + } + + GatherOpsForExtraction(&core_tpu_ops, merged_set, /*predecessors=*/true, + /*successors=*/true); + + // TODO(patn): Verify that all the ops here fall between the forward pass + // and backward pass ops (i.e., not before the forward pass or after the + // backward pass). + return LogicalResult::success(); +} + +LogicalResult FindNonTPUOps(llvm::SetVector& non_tpu_ops, + const llvm::SetVector& merged_set, + func::FuncOp loop_body_func) { + // Find all of the left over Ops after the sc_forward, sc_backward and + // core_tpu ops have been identified. What's left are just the ops necessary + // for updating loop counters etc. + llvm::SetVector non_tpu_args; + for (Operation& op : loop_body_func.getOps()) { + if (merged_set.contains(&op) || llvm::isa(op) || + op.hasAttr(kEmbeddingPipelining)) + continue; + // Note, there should be no TPU ops left at this point. If this trips, + // there's likely a bug in this pass. + if (IsTPUOp(&op)) { + loop_body_func.emitOpError() + << "Unexpcted TPU op found while identifying non-TPU ops."; + return LogicalResult::failure(); + } + non_tpu_ops.insert(&op); + } + + // Validate that remainder_ops takes and returns a subset of the loop carried + // args. This will basically be our set increment fn. 
+ for (Operation* op : non_tpu_ops) + for (Value input : op->getOperands()) + if (BlockArgument arg = llvm::dyn_cast(input)) + // TODO(bfontain): Check that this is actually an argument to the loop + // body. + non_tpu_args.insert(arg.getArgNumber()); + + // All funcs have a return op so this should be safe. + func::ReturnOp return_op = *loop_body_func.getOps().begin(); + + for (OpOperand& operand : return_op->getOpOperands()) { + if (non_tpu_args.contains(operand.getOperandNumber())) { + if (BlockArgument argument = + llvm::dyn_cast(operand.get())) { + if (argument.getArgNumber() != operand.getOperandNumber()) { + return_op.emitOpError() + << "non TPU ops do not divide state into two pieces."; + return LogicalResult::failure(); + } + } else if (!non_tpu_ops.contains(operand.get().getDefiningOp())) { + return_op.emitOpError() + << "non TPU ops do not divide state into two pieces."; + return LogicalResult::failure(); + } + } + } + return LogicalResult::success(); +} + +LogicalResult ExtractOpsAsFunc( + OpBuilder& builder, ModuleOp module, llvm::SetVector& ops, + StringAttr replication_attr, TF::TPUReplicateMetadataOp metadata_op, + TF::TPUCompilationResultOp compilation_op, func::FuncOp parent_func, + const std::string& func_name, Operation** caller) { + // Move the given set of 'ops' into it's own function and replace them with a + // call to that function ('caller'). if 'metadata_op' and 'compilation_op' are + // non-null, also insert those (i.e., target the resulting function to the + // TPU). Here, 'parent_func' is the func.FuncOp that owns the ops in 'ops'. + // + // Returns in 'caller' a tf.StatefulPartitionedCallOp that calls the function + // that was extracted.. + + // Find the input edges to form the set of operands to the new function call. 
+ llvm::SetVector inputs; + for (Operation* op : ops) { + for (Value operand : op->getOperands()) { + Operation* defining_op = operand.getDefiningOp(); + if (!ops.contains(defining_op)) inputs.insert(operand); + } + } + // Find the output edges to form the set of resutls of the new function call. + llvm::SetVector results; + for (Operation* op : ops) { + for (auto result : op->getResults()) { + for (const OpOperand& operand : result.getUsers()) { + if (!ops.contains(operand.getOwner())) { + results.insert(result); + break; + } + } + } + } + llvm::SetVector outputs; + for (auto output : results) outputs.insert(output); + auto tf_caller = EncapsulateOpsInFunc(builder, ops, inputs, outputs, + parent_func, module, func_name); + if (!ops.empty() && metadata_op != nullptr && compilation_op != nullptr) + UpdateAndInsertTPUOps(tf_caller, metadata_op, compilation_op, + replication_attr); + *caller = tf_caller; + return LogicalResult::success(); +} + +void EmbeddingSequencingPass::runOnOperation() { + ModuleOp module = getOperation(); + + llvm::SetVector forward_pass_ops; + llvm::SetVector backward_pass_ops; + + // Find all ops that we know compose the embedding forward and backward pass. + // These ops are only tagged if one enables the + // `pipeline_execution_with_tensor_core` flag in the mid-level API. 
+ WalkResult walk_result = module.walk([&](Operation* op) -> WalkResult { + if (op->hasAttr(kEmbeddingPipelining)) { + const std::string region = + op->getAttrOfType(kEmbeddingPipelining).getValue().str(); + if (region == kEmbeddingForward) { + forward_pass_ops.insert(op); + } else if (region == kEmbeddingBackward) { + backward_pass_ops.insert(op); + } else { + return op->emitOpError() + << "embedding op has unknown " << kEmbeddingPipelining + << " attribute value " << region << "."; + } + op->removeAttr(kEmbeddingPipelining); + } + return WalkResult::advance(); + }); + if (walk_result.wasInterrupted()) return signalPassFailure(); + + // If there are no forward pass ops, there is no SC, so we end early. + if (forward_pass_ops.empty()) { + if (backward_pass_ops.empty()) { + return; + } else { + (*backward_pass_ops.begin())->emitOpError() + << "embedding backwards pass op with no forwards pass ops."; + return signalPassFailure(); + } + } + + // Ensure that all ops are in the same region, and have the same replication + // info. + // TODO(bfontain): Allow for multiple regions/loops in one module. + // TODO(patn): move this pass after cluster formation to remove the complexity + // with replication info and metadata, cluster checking and generalizing to + // multiple TPU clusters. 
+ Region* region = (*forward_pass_ops.begin())->getParentRegion(); + StringAttr replication_attr = GetReplicationAttr(*forward_pass_ops.begin()); + llvm::SmallVector checkset(forward_pass_ops.getArrayRef()); + checkset.append(backward_pass_ops.begin(), backward_pass_ops.end()); + for (Operation* op : checkset) { + if (op->getParentRegion() != region) { + op->emitOpError() << "embedding ops in two different regions"; + return signalPassFailure(); + } + if (GetReplicationAttr(op) != replication_attr) { + op->emitOpError() << "embedding ops with different replication info " + << replication_attr << " vs " << GetReplicationAttr(op); + return signalPassFailure(); + } + } + + // TODO(bfontain): Check that the region here is the region + // of the loop body func. + // Find the FuncOp for the surrounding while loop body. + func::FuncOp loop_body_func = + (*forward_pass_ops.begin())->getParentOfType(); + + // merged_set will keep track of which ops are to be avoided when gather ops + // for inclusion into the four extracted functions. + llvm::SetVector merged_set; + + // Find the TPUReplicationMetadata and TPUCompilationResult ops and delete + // them. These will be cloned/inserted into each region. 
+ TF::TPUReplicateMetadataOp metadata_op; + auto result = FindAndExcludeOp(loop_body_func, replication_attr, merged_set, + metadata_op); + if (failed(result)) return signalPassFailure(); + const int num_replicas = metadata_op.getNumReplicas(); + + TF::TPUCompilationResultOp compilation_op; + result = FindAndExcludeOp( + loop_body_func, replication_attr, merged_set, compilation_op); + if (failed(result)) return signalPassFailure(); + + TF::WhileOp while_op = nullptr; + result = FindOwningWhileOp(loop_body_func, module, &while_op); + if (failed(result)) return signalPassFailure(); + + OpBuilder builder(module); + + result = FindForwardPassOps(builder, forward_pass_ops, backward_pass_ops, + merged_set, loop_body_func, num_replicas); + if (failed(result)) return signalPassFailure(); + merged_set.insert(forward_pass_ops.begin(), forward_pass_ops.end()); + + result = + FindBackwardPassOps(builder, backward_pass_ops, merged_set, num_replicas); + if (failed(result)) return signalPassFailure(); + merged_set.insert(backward_pass_ops.begin(), backward_pass_ops.end()); + + llvm::SetVector core_tpu_ops; + result = FindCoreTPUOps(core_tpu_ops, forward_pass_ops, backward_pass_ops, + merged_set, loop_body_func); + if (failed(result)) return signalPassFailure(); + merged_set.insert(core_tpu_ops.begin(), core_tpu_ops.end()); + + llvm::SetVector non_tpu_ops; + result = FindNonTPUOps(non_tpu_ops, merged_set, loop_body_func); + if (failed(result)) return signalPassFailure(); + merged_set.insert(non_tpu_ops.begin(), non_tpu_ops.end()); + + VLOG(2) << "Forwards pass " << forward_pass_ops.size() + << " ops, backwards pass " << backward_pass_ops.size() + << " ops, core " << core_tpu_ops.size() + << " ops. 
Total = " << merged_set.size() << " of " + << GetNumOps(loop_body_func) << ".\n"; + + builder.setInsertionPointAfter(*non_tpu_ops.begin()); + Operation* non_tpu_caller = nullptr; + result = + ExtractOpsAsFunc(builder, module, non_tpu_ops, replication_attr, nullptr, + nullptr, loop_body_func, "non_tpu", &non_tpu_caller); + if (failed(result)) return signalPassFailure(); + + builder.setInsertionPointAfter(non_tpu_caller); + Operation* forward_caller = nullptr; + result = ExtractOpsAsFunc(builder, module, forward_pass_ops, replication_attr, + metadata_op, compilation_op, loop_body_func, + "sc_forward", &forward_caller); + if (failed(result)) return signalPassFailure(); + + // Create tpu_core function + builder.setInsertionPointAfter(forward_caller); + Operation* core_tpu_caller = nullptr; + result = ExtractOpsAsFunc(builder, module, core_tpu_ops, replication_attr, + metadata_op, compilation_op, loop_body_func, + "core_tpu", &core_tpu_caller); + if (failed(result)) return signalPassFailure(); + + builder.setInsertionPointAfter(core_tpu_caller); + Operation* backwards_pass_caller = nullptr; + result = ExtractOpsAsFunc( + builder, module, backward_pass_ops, replication_attr, metadata_op, + compilation_op, loop_body_func, "sc_backward", &backwards_pass_caller); + if (failed(result)) return signalPassFailure(); + + metadata_op->erase(); + compilation_op->erase(); +} + +} // namespace + +std::unique_ptr> CreateEmbeddingSequencingPass() { + return std::make_unique(); +} + +} // namespace TFDevice +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_head_tail_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/extract_head_tail_outside_compilation.cc index 58f2e62df2f..863bdc6b635 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/extract_head_tail_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/extract_head_tail_outside_compilation.cc @@ -53,10 +53,15 @@ namespace TFDevice { 
namespace { +constexpr char kXlaMapOutsideCompilationAttr[] = "_xla_map_outside_compilation"; constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; +// Return true if `op` has attributes that say it can be outside compiled by +// this pass. This pass ignores _xla_map_outside_compilation, which will only be +// handled by extract_outside_compilation pass. bool HasOutsideCompilationAttribute(Operation* op) { - return op->getAttrOfType(kXlaOutsideCompilationAttr) != nullptr; + return op->getAttrOfType(kXlaOutsideCompilationAttr) != nullptr && + !op->hasAttrOfType(kXlaMapOutsideCompilationAttr); } // Finds op that created a given value. If the value is a BlockArgument, this diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc index 9c3e82e88e1..8b3acdf0063 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/extract_outside_compilation.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -32,10 +33,10 @@ limitations under the License. #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project @@ -53,6 +54,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/string_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" namespace mlir { @@ -62,6 +64,7 @@ namespace { constexpr char kDeviceAttr[] = "device"; constexpr char kHostFunctionAttr[] = "host_func"; +constexpr char kXlaMapOutsideCompilationAttr[] = "_xla_map_outside_compilation"; constexpr char kXlaOutsideCompilationAttr[] = "_xla_outside_compilation"; constexpr char kNoReplicationCluster[] = "__no_replication_cluster"; @@ -444,23 +447,189 @@ void GetExternalOutputs(const llvm::SmallSetVector& cluster_ops, } } -// Creates the HostCompute with `inputs` and `outputs` -// using `communication_key`. -TF::_XlaHostComputeMlirOp CreateHostCompute( - OpBuilder& builder, Location loc, - const llvm::SmallSetVector& inputs, llvm::ArrayRef outputs, - llvm::StringRef args_communication_key, - llvm::StringRef retvals_communication_key, - llvm::StringRef serialized_func_module) { +// Output `shard_type`, which is the type of each shard, given `full_type`. If +// the full shape is (num_cores_per_replica * a, b, c), then the shard shape is +// (a, b, c). `context_op` is used for error reporting, in case of errors. 
+LogicalResult GetShardShapedType(Operation* context_op, + int num_cores_per_replica, Type full_type, + Type& shard_type) { + RankedTensorType ranked_type = full_type.dyn_cast(); + if (!ranked_type) + return context_op->emitOpError() + << "A map_outside_compilation op's input and output types must be " + "ranked tensors."; + ArrayRef in_shape = ranked_type.getShape(); + if (in_shape.empty() || in_shape[0] < 0) { + return context_op->emitOpError() + << "A map_outside_compilation op's input and output shapes must " + "have rank at least one and the first dimension must be known."; + } + int64_t split_size = in_shape[0] / num_cores_per_replica; + if (in_shape[0] % num_cores_per_replica != 0) { + return context_op->emitOpError() + << "A map_outside_compilation op's input and output shapes must be " + "divisible by num_cores_per_replica=" + << num_cores_per_replica; + } + llvm::SmallVector shape; + shape.push_back(split_size); + for (int i = 1; i < in_shape.size(); ++i) { + shape.push_back(in_shape[i]); + } + shard_type = RankedTensorType::Builder(ranked_type).setShape(shape); + return success(); +} + +// Output `sharding`, which is the sharding of `val`. `context_op` is used for +// error reporting, in case of errors. +// TODO(b/255350483): Explicitly pass the sharding to map_outside_compilation, +// so it does not need to be retrieved from a Value. +LogicalResult GetShardingOfValue(Operation* context_op, Value val, + std::string& sharding) { + Operation* op = val.getDefiningOp(); + // val should always have a defining op because cluster inputs always have + // defining ops. + assert(op); + StringAttr sharding_attr = op->getAttrOfType("_XlaSharding"); + if (!sharding_attr) + return context_op->emitOpError() + << "A map_outside_compilation op's input should have an explicit " + "sharding. 
There is no _XlaSharding attribute on the input op."; + sharding = sharding_attr.str(); + return success(); +} + +// Create an `_XlaHostComputeMlir` for the map_outside_compilation case. Inputs +// are converted from split sharding to MANUAL sharding and outputs are +// converted from MANUAL sharding to split sharding. Output `full_outputs`, +// which is the outputs of the `_XlaHostComputeMlir` and add the +// `_XlaHostComputeMlir` to `host_compute_out_ops`. +LogicalResult CreateHostComputeMap( + Operation* original_op, OpBuilder& builder, Location loc, + ArrayRef inputs, ArrayRef outputs, + StringRef args_communication_key, StringRef retvals_communication_key, + StringRef serialized_func_module, int num_cores_per_replica, + SmallVector& full_outputs, + SmallVector& host_compute_out_ops) { + // Get output types. + llvm::SmallVector shard_output_types; + llvm::SmallVector full_output_types; + shard_output_types.reserve(outputs.size()); + full_output_types.reserve(outputs.size()); + for (const auto& output : outputs) { + Type shard_type; + if (failed(GetShardShapedType(original_op, num_cores_per_replica, + output.getType(), shard_type))) + return failure(); + shard_output_types.push_back(shard_type); + full_output_types.push_back(output.getType()); + } + + // There should be at least 1 input so common_split_sharding can be defined. + if (inputs.empty()) + return original_op->emitOpError() + << "map_outside_compilation should have at least one input"; + + // Convert split sharded inputs to MANUAL sharded inputs. + // common_split_sharding is the split sharding that is common to all inputs + // and outputs. 
+ std::string common_split_sharding; + llvm::SmallVector manual_inputs; + manual_inputs.reserve(inputs.size()); + for (Value in : inputs) { + Type shard_type; + if (failed(GetShardShapedType(original_op, num_cores_per_replica, + in.getType(), shard_type))) + return failure(); + std::string in_sharding; + if (failed(GetShardingOfValue(original_op, in, in_sharding))) + return failure(); + if (common_split_sharding.empty()) { + common_split_sharding = std::move(in_sharding); + } else { + if (common_split_sharding != in_sharding) + return original_op->emitOpError() + << "All inputs and outputs of map_outside_compilation should " + "have the same sharding."; + } + auto in_manual = builder.create( + loc, shard_type, in, common_split_sharding, /*dim=*/-1, + /*unspecified_dims=*/builder.getI64ArrayAttr({})); + manual_inputs.push_back(in_manual); + } + + // Create the _XlaHostComputeMlirOp + auto host_compute = builder.create( + loc, shard_output_types, manual_inputs, + /*send_key=*/builder.getStringAttr(args_communication_key), + /*recv_key=*/builder.getStringAttr(retvals_communication_key), + /*host_mlir_module=*/builder.getStringAttr(serialized_func_module), + /*manual_sharding=*/builder.getBoolAttr(true)); + host_compute_out_ops.push_back(host_compute); + + // Convert MANUAL sharded outputs to split sharded outputs. + for (auto [full_type, out] : + llvm::zip(full_output_types, host_compute.getResults())) { + RankedTensorType full_type_ranked = full_type.dyn_cast(); + if (!full_type_ranked) + return original_op->emitOpError() + << "map_outside_compilation must have ranked outputs"; + auto out_full = builder.create( + loc, full_type, out, common_split_sharding, full_type_ranked.getShape(), + /*dim=*/-1, + /*unspecified_dims=*/builder.getI64ArrayAttr({})); + host_compute_out_ops.push_back(out_full); + full_outputs.push_back(out_full); + } + + return success(); +} + +// Create the _XlaHostComputeMlir with `inputs` and `outputs` for the ordinary +// outside_compilation case. 
+// Output `full_outputs`, which is the outputs of the `_XlaHostComputeMlir` and +// add the `_XlaHostComputeMlir` to `host_compute_out_ops`. +void CreateHostComputeNotMap(OpBuilder& builder, Location loc, + ArrayRef inputs, ArrayRef outputs, + StringRef args_communication_key, + StringRef retvals_communication_key, + StringRef serialized_func_module, + SmallVector& full_outputs, + SmallVector& host_compute_out_ops) { llvm::SmallVector device_output_types; for (const auto& output : outputs) device_output_types.push_back(output.getType()); auto host_compute = builder.create( - loc, device_output_types, inputs.getArrayRef(), + loc, device_output_types, inputs, builder.getStringAttr(args_communication_key), builder.getStringAttr(retvals_communication_key), /*host_mlir_module=*/builder.getStringAttr(serialized_func_module)); - return host_compute; + host_compute_out_ops.push_back(host_compute); + for (Value v : host_compute.getResults()) full_outputs.push_back(v); +} + +// Create the _XlaHostComputeMlir with `inputs` and `outputs`. +// Output `full_outputs`, which is the outputs of the `_XlaHostComputeMlir` and +// add the `_XlaHostComputeMlir` to `host_compute_out_ops`. 
+LogicalResult CreateHostCompute( + Operation* original_op, OpBuilder& builder, Location loc, + ArrayRef inputs, ArrayRef outputs, + StringRef args_communication_key, StringRef retvals_communication_key, + StringRef serialized_func_module, bool is_map_oc, int num_cores_per_replica, + SmallVector& full_outputs, + SmallVector& host_compute_out_ops) { + if (is_map_oc) { + return CreateHostComputeMap( + original_op, builder, loc, inputs, outputs, args_communication_key, + retvals_communication_key, serialized_func_module, + num_cores_per_replica, full_outputs, host_compute_out_ops); + } else { + CreateHostComputeNotMap(builder, loc, inputs, outputs, + args_communication_key, retvals_communication_key, + serialized_func_module, full_outputs, + host_compute_out_ops); + return success(); + } } void MarkOutsideCompiled(Operation* op) { @@ -498,10 +667,10 @@ bool ShouldCloseCluster(llvm::ArrayRef outputs) { // region as insertion. // For static-shapes, Replace operand usages if op is in the same region as // insertion or if the op is outside compiled and will be moved to host later. -void ReplaceExternalOperandUsage( - const llvm::SmallSetVector& external_operands, - Operation* recv_at_host, Operation* insertion_point, - Block* original_op_block) { +void ReplaceExternalOperandUsage(ArrayRef external_operands, + Operation* recv_at_host, + Operation* insertion_point, + Block* original_op_block) { auto replace_operand_usage = [&](OpOperand& operand) { if (TF::CanBeRefined(operand.get().getType()) || HasDynamicOutputs(operand.getOwner())) { @@ -531,10 +700,9 @@ bool HasDynamicOutputs(llvm::ArrayRef outputs) { // Replaces usages of `external_outputs` which are values returned by outside // compilation with the corresponding outputs from `host_compute`. 
-void ReplaceExternalOutputUsage( - const llvm::SmallSetVector& external_outputs, - TF::_XlaHostComputeMlirOp host_compute) { - bool has_dynamic_outputs = HasDynamicOutputs(external_outputs.getArrayRef()); +void ReplaceExternalOutputUsage(ArrayRef external_outputs, + ArrayRef host_compute_outputs) { + bool has_dynamic_outputs = HasDynamicOutputs(external_outputs); auto replace_output_usage = [&](OpOperand& operand) { // Don't replace output usages if in host computation (defining op and user @@ -551,25 +719,16 @@ void ReplaceExternalOutputUsage( !HasOutsideCompilationAncestor(operand.getOwner()); } }; - for (auto result : llvm::zip(external_outputs, host_compute.getResults())) { + for (auto result : llvm::zip(external_outputs, host_compute_outputs)) { Value external_output = std::get<0>(result); external_output.replaceUsesWithIf(std::get<1>(result), replace_output_usage); } } -// Move `clustered_ops` to run on host and adds communication ops to transfer -// `external_operands` and `external_outputs` to/from device/host. Inserts -// ops at `insertion_point` and uses `compilation_key` and `device_ordinal` when -// creating comm ops. 
-void MoveOpsToHost(const llvm::SmallSetVector& clustered_ops, - const llvm::SmallSetVector& external_operands, - const llvm::SmallSetVector& external_outputs, - Operation* insertion_point, Value compilation_key, - Value device_ordinal, int default_device_ordinal, - StringAttr device_type_attr, int& communication_key_index) { - OpBuilder builder(insertion_point); - Operation& op = *clustered_ops.back(); +std::pair MakeCommunicationKeys( + ArrayRef clustered_ops, ArrayRef external_operands, + int communication_key_index, Operation& op) { std::string args_communication_key = llvm::formatv("host_compute_channel_{0}_args", (communication_key_index)) .str(); @@ -586,22 +745,22 @@ void MoveOpsToHost(const llvm::SmallSetVector& clustered_ops, llvm::formatv("if_predicate_channel_{0}", (communication_key_index)) .str(); } + return std::pair(args_communication_key, retvals_communication_key); +} - std::string serialized_func_module; - if (HasDynamicOutputs(external_outputs.getArrayRef())) { - func::FuncOp shape_op = BuildFunction( - clustered_ops.getArrayRef(), external_operands.getArrayRef(), - external_outputs.getArrayRef(), &builder); - EncapsulateFuncAndSerialize(shape_op, &serialized_func_module); - } - - builder.setInsertionPoint(&op); - auto host_compute = - CreateHostCompute(builder, op.getLoc(), external_operands, - external_outputs.getArrayRef(), args_communication_key, - retvals_communication_key, serialized_func_module); - // Insert ops on the host side computation to receive data from device. - builder.setInsertionPoint(insertion_point); +// Add ops to the host-side. These are `RecvAtHost`, `clustered_ops` moved from +// device cluster, `SendFromHost`. Add these host-side ops to `host_ops`. Return +// the `RecvAtHost` op. 
+Operation* CreateHostOps(ArrayRef clustered_ops, + ArrayRef external_operands, + ArrayRef external_outputs, + Operation* host_insertion_point, Value compilation_key, + Value device_ordinal, int default_device_ordinal, + StringAttr device_type_attr, OpBuilder& builder, + Operation& op, std::string args_communication_key, + std::string retvals_communication_key, + SmallVector& host_ops) { + builder.setInsertionPoint(host_insertion_point); llvm::SmallVector host_operand_types; for (const auto& operand : external_operands) host_operand_types.push_back(operand.getType()); @@ -609,37 +768,174 @@ void MoveOpsToHost(const llvm::SmallSetVector& clustered_ops, Operation* recv_at_host = CreateRecvAtHostOp( builder, op.getLoc(), host_operand_types, compilation_key, device_ordinal, default_device_ordinal, device_type_attr, args_communication_key); - Block* original_op_block = op.getBlock(); + + if (!external_operands.empty()) host_ops.push_back(recv_at_host); Operation* after_op = recv_at_host; for (Operation* cluster_op : clustered_ops) { cluster_op->moveAfter(after_op); cluster_op->removeAttr(StringAttr::get(op.getContext(), kDeviceAttr)); after_op = cluster_op; + host_ops.push_back(cluster_op); } if (!external_outputs.empty()) { - CreateSendFromHostOp(builder, op.getLoc(), external_outputs.getArrayRef(), - compilation_key, device_ordinal, - default_device_ordinal, device_type_attr, - retvals_communication_key); + Operation* send_from_host = CreateSendFromHostOp( + builder, op.getLoc(), external_outputs, compilation_key, device_ordinal, + default_device_ordinal, device_type_attr, retvals_communication_key); + host_ops.push_back(send_from_host); } + return recv_at_host; +} + +// Clone the first outside compiled region to one for each TPU core. This is +// used for map_outside_compilation. +// Message identification arguments to RecvAtHost and SendFromHost are changed. 
+void CloneFirstHost(ArrayRef core_to_host_insertion_point, + ArrayRef core_to_compilation_key, + ArrayRef core_to_device_ordinal, + int num_cores_per_replica, ArrayRef host0_ops, + OpBuilder& builder) { + for (int core = 1; core < num_cores_per_replica; ++core) { + IRMapping mapper; + for (Operation* op : host0_ops) { + builder.setInsertionPoint(core_to_host_insertion_point[core]); + Operation* clone = builder.clone(*op, mapper); + mapper.map(op, clone); + if (auto recv_at_host = llvm::dyn_cast(clone)) { + recv_at_host.setDeviceOrdinal(core); + clone->setOperand(0, core_to_compilation_key[core]); + } else if (auto send_from_host = + llvm::dyn_cast(clone)) { + send_from_host.setDeviceOrdinal(core); + clone->setOperand(1, core_to_compilation_key[core]); + } else if (auto recv_at_host = + llvm::dyn_cast(clone)) { + recv_at_host.setOperand(0, core_to_compilation_key[core]); + builder.setInsertionPoint(recv_at_host); + // core_ordinal = device_ordinal + core + // where device_ordinal is the base device for the replica + Value device_ordinal = core_to_device_ordinal[core]; + Value const_core = builder.create( + recv_at_host.getLoc(), builder.getI64IntegerAttr(core)); + Value core_ordinal = builder.create( + recv_at_host.getLoc(), device_ordinal.getType(), device_ordinal, + const_core); + recv_at_host.setOperand(1, core_ordinal); + } else if (auto send_from_host = + llvm::dyn_cast(clone)) { + send_from_host.setOperand(1, core_to_compilation_key[core]); + builder.setInsertionPoint(send_from_host); + // core_ordinal = device_ordinal + core + // where device_ordinal is the base device for the replica + Value device_ordinal = core_to_device_ordinal[core]; + Value const_core = builder.create( + send_from_host.getLoc(), builder.getI64IntegerAttr(core)); + Value core_ordinal = builder.create( + send_from_host.getLoc(), device_ordinal.getType(), device_ordinal, + const_core); + send_from_host.setOperand(2, core_ordinal); + } + } + } +} + +// Move `clustered_ops` to run on host 
and adds communication ops to transfer +// `external_operands` and `external_outputs` to/from device/host. Inserts +// ops at `insertion_point` and uses `compilation_key` and `device_ordinal` when +// creating comm ops. +LogicalResult MoveToHostSingleCluster( + ArrayRef clustered_ops, ArrayRef external_operands, + ArrayRef external_outputs, + ArrayRef core_to_host_insertion_point, + ArrayRef core_to_compilation_key, + ArrayRef core_to_device_ordinal, int default_device_ordinal, + StringAttr device_type_attr, bool is_map_oc, int num_cores_per_replica, + int& communication_key_index) { + OpBuilder builder(core_to_host_insertion_point[0]); + Operation& op = *clustered_ops.back(); + Block* original_op_block = op.getBlock(); + auto [args_communication_key, retvals_communication_key] = + MakeCommunicationKeys(clustered_ops, external_operands, + communication_key_index, op); + + std::string serialized_func_module; + if (HasDynamicOutputs(external_outputs)) { + func::FuncOp shape_op = BuildFunction(clustered_ops, external_operands, + external_outputs, &builder); + EncapsulateFuncAndSerialize(shape_op, &serialized_func_module); + } + + builder.setInsertionPoint(&op); + SmallVector host_compute_outputs; + SmallVector host_compute_out_ops; + if (failed(CreateHostCompute( + &op, builder, op.getLoc(), external_operands, external_outputs, + args_communication_key, retvals_communication_key, + serialized_func_module, is_map_oc, num_cores_per_replica, + host_compute_outputs, host_compute_out_ops))) + return failure(); + + // Insert ops on the host side computation to receive data from device. + // host0_ops are the ops that will make up the first host process. In the + // map_outside_compilation case, there are multiple host processes, which will + // be created by cloning. 
+ SmallVector host0_ops; + Operation* recv_at_host = CreateHostOps( + clustered_ops, external_operands, external_outputs, + core_to_host_insertion_point[0], core_to_compilation_key[0], + core_to_device_ordinal.empty() ? nullptr : core_to_device_ordinal[0], + default_device_ordinal, device_type_attr, builder, op, + args_communication_key, retvals_communication_key, host0_ops); + if (external_operands.empty()) { recv_at_host->erase(); } else { - ReplaceExternalOperandUsage(external_operands, - /*recv_at_host=*/recv_at_host, - /*insertion_point=*/insertion_point, - /*original_op_block=*/original_op_block); + ReplaceExternalOperandUsage( + external_operands, recv_at_host, + /*insertion_point=*/core_to_host_insertion_point[0], original_op_block); } - ReplaceExternalOutputUsage(external_outputs, host_compute); + ReplaceExternalOutputUsage(external_outputs, host_compute_outputs); + + // Clone the first outside compiled region to one for each TPU core. + if (is_map_oc) + CloneFirstHost(core_to_host_insertion_point, core_to_compilation_key, + core_to_device_ordinal, num_cores_per_replica, host0_ops, + builder); + + ReplaceExternalOutputUsage(external_outputs, host_compute_outputs); if (external_operands.empty() && external_outputs.empty()) { - host_compute.erase(); + for (Operation* op : host_compute_out_ops) op->erase(); } else { ++communication_key_index; } + + return success(); +} + +// Update is_map_oc the true if op has attribute _xla_map_outside_compilation +// and false otherwise. Check that this is consistent with the previous setting +// of is_map_oc. 
+LogicalResult UpdateIsMapOutsideCompilation(Operation& op, bool control_above, + std::optional& is_map_oc) { + bool op_is_map_oc = + op.hasAttrOfType(kXlaMapOutsideCompilationAttr); + if (is_map_oc) { + if (op_is_map_oc != *is_map_oc) { + return op.emitOpError() + << "Cannot mix map_outside_compilation with ordinary " + "outside_compilation in the same graph."; + } + } else { + is_map_oc = op_is_map_oc; + } + if (control_above && op_is_map_oc) { + return op.emitOpError() << "map_outside_compilation inside control flow " + "is not implemented."; + } + return success(); } // Move outside compiled ops in `src` to `insertion_point` in host @@ -649,13 +945,21 @@ void MoveOpsToHost(const llvm::SmallSetVector& clustered_ops, // `communication_key_index` which is incremented when used. Communication ops // are added only when needed and at the location need. There are checks to // ensure that duplicate communication between device and host is not added. -// When `return_value_from_host` is not nullptr, MoveOpsToHost will also update -// its value. -LogicalResult MoveOpsToHost( - tf_device::ClusterOp device_cluster, Block* src, Operation* insertion_point, - Value compilation_key, Value device_ordinal, int default_device_ordinal, +// When `return_value_from_host` is not nullptr, MoveToHostMultiCluster will +// also update its value. `control_above` means that this Block is within +// control flow, which is not currently supported with map_outside_compilation. +// `is_map_oc` tracks whether map_outside_compilation is used, for the whole +// program. Currently only map_outside_compilation-only or ordinary +// outside_compilation only is supported. 
+LogicalResult MoveToHostMultiCluster( + tf_device::ClusterOp device_cluster, Block* src, + ArrayRef core_to_host_insertion_point, + ArrayRef core_to_compilation_key, + ArrayRef core_to_device_ordinal, int default_device_ordinal, + bool control_above, std::optional& is_map_oc, int& communication_key_index, llvm::SmallVector* return_value_from_host = nullptr) { + int num_cores_per_replica = core_to_host_insertion_point.size(); // Contains all of the outside compiled operations that should be moved to the // host using a single `_XlaHostComputeMlir` op. This should only contain a // single op except in the case where some of the input/output shapes are @@ -669,6 +973,9 @@ LogicalResult MoveOpsToHost( !op.hasAttrOfType(kXlaOutsideCompilationAttr)) continue; + if (failed(UpdateIsMapOutsideCompilation(op, control_above, is_map_oc))) + return failure(); + llvm::SmallSetVector external_outputs; llvm::SmallVector host_outputs; // We want to move the clustered_ops if the op to be added has all @@ -684,10 +991,13 @@ LogicalResult MoveOpsToHost( return_value_from_host->push_back(output); } } - MoveOpsToHost(clustered_ops, external_operands, external_outputs, - insertion_point, compilation_key, device_ordinal, - default_device_ordinal, device_type_attr, - communication_key_index); + if (failed(MoveToHostSingleCluster( + clustered_ops.getArrayRef(), external_operands.getArrayRef(), + external_outputs.getArrayRef(), core_to_host_insertion_point, + core_to_compilation_key, core_to_device_ordinal, + default_device_ordinal, device_type_attr, *is_map_oc, + num_cores_per_replica, communication_key_index))) + return failure(); clustered_ops.clear(); } @@ -708,10 +1018,13 @@ LogicalResult MoveOpsToHost( } } - MoveOpsToHost(clustered_ops, external_operands, external_outputs, - insertion_point, compilation_key, device_ordinal, - default_device_ordinal, device_type_attr, - communication_key_index); + if (failed(MoveToHostSingleCluster( + clustered_ops.getArrayRef(), 
external_operands.getArrayRef(), + external_outputs.getArrayRef(), core_to_host_insertion_point, + core_to_compilation_key, core_to_device_ordinal, + default_device_ordinal, device_type_attr, *is_map_oc, + num_cores_per_replica, communication_key_index))) + return failure(); clustered_ops.clear(); } } @@ -736,27 +1049,34 @@ void GetReturnValueFromDevice( // (outside compiled) computation into two separate control flow ops with // communication between the device/host for data dependencies. Both device and // host control flow initially remain within `device_cluster` and a subsequency -// call to MoveOpsToHost moves the host side control flow to the host launch in -// tf_device.parallel_execute. Uses `compilation_key, `device_ordinal` and -// `communication_key_index` when creating communication ops. +// call to MoveToHostSingleCluster moves the host side control flow to the host +// launch in tf_device.parallel_execute. Uses `compilation_key, +// `device_ordinal` and `communication_key_index` when creating communication +// ops. 
LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, - Value compilation_key, Value device_ordinal, + ArrayRef core_to_compilation_key, + ArrayRef core_to_device_ordinal, int default_device_ordinal, - int& communication_key_index) { + int& communication_key_index, + std::optional& is_map_oc) { auto result = device_cluster.GetBody().walk([&](Operation* op) { if (auto if_op = llvm::dyn_cast(op)) { if (!HasOutsideCompilationNested(op)) return WalkResult::advance(); OpBuilder builder(if_op); auto host_if = CloneEmptyIfWithPredicate(if_op, builder); - if (failed(MoveOpsToHost( + if (failed(MoveToHostMultiCluster( device_cluster, &if_op.getThenBranch().front(), - host_if.getThenBranch().front().getTerminator(), compilation_key, - device_ordinal, default_device_ordinal, communication_key_index))) + {host_if.getThenBranch().front().getTerminator()}, + core_to_compilation_key, core_to_device_ordinal, + default_device_ordinal, /*control_above=*/true, is_map_oc, + communication_key_index))) return WalkResult::interrupt(); - if (failed(MoveOpsToHost( + if (failed(MoveToHostMultiCluster( device_cluster, &if_op.getElseBranch().front(), - host_if.getElseBranch().front().getTerminator(), compilation_key, - device_ordinal, default_device_ordinal, communication_key_index))) + {host_if.getElseBranch().front().getTerminator()}, + core_to_compilation_key, core_to_device_ordinal, + default_device_ordinal, /*control_above=*/true, is_map_oc, + communication_key_index))) return WalkResult::interrupt(); // Mark op as stateful due to side-effecting communication ops. if_op->setAttr("is_stateless", builder.getBoolAttr(false)); @@ -778,24 +1098,32 @@ LogicalResult DecomposeControlFlow(tf_device::ClusterOp device_cluster, builder.setInsertionPoint(while_op.getCond().front().getTerminator()); builder.create(while_op.getLoc(), condition, condition_send_recv_key); + // device_ordinal0 is the ordinal of TPU_REPLICATED_CORE_0 and is only + // used in the replicated case. 
+ Value device_ordinal0 = nullptr; + if (!core_to_device_ordinal.empty()) + device_ordinal0 = core_to_device_ordinal[0]; builder.setInsertionPointToEnd(&cond.front()); auto recv_condition_at_host = CreateRecvAtHostOp( builder, while_op.getLoc(), TypeRange{condition.getType()}, - compilation_key, device_ordinal, default_device_ordinal, + core_to_compilation_key[0], device_ordinal0, default_device_ordinal, device_cluster->getAttrOfType(TF::kCompileDeviceTypeAttr), condition_send_recv_key); builder.create(while_op.getLoc(), recv_condition_at_host->getResults()); - if (failed(MoveOpsToHost(device_cluster, &while_op.getCond().front(), - recv_condition_at_host, compilation_key, - device_ordinal, default_device_ordinal, - communication_key_index))) + if (failed(MoveToHostMultiCluster( + device_cluster, &while_op.getCond().front(), + {recv_condition_at_host}, core_to_compilation_key, + core_to_device_ordinal, default_device_ordinal, + /*control_above=*/true, is_map_oc, communication_key_index))) return WalkResult::interrupt(); - if (failed(MoveOpsToHost( + if (failed(MoveToHostMultiCluster( device_cluster, &while_op.getBody().front(), - host_while.getBody().front().getTerminator(), compilation_key, - device_ordinal, default_device_ordinal, communication_key_index))) + {host_while.getBody().front().getTerminator()}, + core_to_compilation_key, core_to_device_ordinal, + default_device_ordinal, /*control_above=*/true, is_map_oc, + communication_key_index))) return WalkResult::interrupt(); // Mark op as stateful due to side-effecting communication ops. while_op->setAttr("is_stateless", builder.getBoolAttr(false)); @@ -859,8 +1187,8 @@ LogicalResult GetDefaultDeviceOrdinal(tf_device::ClusterOp device_cluster, // The results of parallel executes is the combination of return values from // both host and device. 
llvm::SmallVector GetParallelExecuteResultsTypes( - const llvm::SmallVector& return_value_from_host, - const llvm::SmallVector& return_value_from_device) { + ArrayRef return_value_from_host, + ArrayRef return_value_from_device) { llvm::SmallVector parallel_execute_result_types; const int num_of_outputs = return_value_from_host.size() + return_value_from_device.size(); @@ -939,7 +1267,7 @@ void RemapDeviceClusterResultsWithParallelExecuteResults( // Get the vector of results for new device cluster llvm::SmallVector GetNewDeviceResults( - const llvm::SmallVector& return_value_from_device) { + ArrayRef return_value_from_device) { llvm::SmallVector device_results; device_results.reserve(return_value_from_device.size()); for (Value old_result : return_value_from_device) @@ -949,7 +1277,7 @@ llvm::SmallVector GetNewDeviceResults( // Get the vector of types of results for new device cluster llvm::SmallVector GetNewDeviceTypes( - const llvm::SmallVector& return_value_from_device) { + ArrayRef return_value_from_device) { llvm::SmallVector device_result_types; device_result_types.reserve(return_value_from_device.size()); for (Value old_result : return_value_from_device) @@ -983,10 +1311,11 @@ void MoveTmpLaunchOpToNewLaunchOp(tf_device::LaunchOp tmp_host_launch_op, // Still, one region is for the host computation for outside compilation and // the other one is for the original Device cluster computation. 
tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( - OpBuilder& builder, int num_regions, llvm::StringRef host_device, - tf_device::ClusterOp device_cluster, tf_device::LaunchOp tmp_host_launch_op, - const llvm::SmallVector& return_value_from_host, - const llvm::SmallVector& return_value_from_device) { + OpBuilder& builder, int num_regions, ArrayRef core_to_host, + tf_device::ClusterOp device_cluster, + ArrayRef core_to_tmp_host_launch, + ArrayRef return_value_from_host, + ArrayRef return_value_from_device) { llvm::SmallVector parallel_execute_result_types = GetParallelExecuteResultsTypes(return_value_from_host, return_value_from_device); @@ -994,25 +1323,35 @@ tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( builder.setInsertionPoint(device_cluster); auto parallel_execute_op = builder.create( device_cluster.getLoc(), num_regions, parallel_execute_result_types); - Block& host_computation_block = - parallel_execute_op.GetRegionBlockWithIndex(0); - builder.setInsertionPointToEnd(&host_computation_block); + SmallVector core_to_host_launch; + for (int core = 0; core < core_to_tmp_host_launch.size(); ++core) { + Block& host_computation_block = + parallel_execute_op.GetRegionBlockWithIndex(core); + builder.setInsertionPointToEnd(&host_computation_block); - // Create a single launch op for all outside compiled ops. - llvm::SmallVector host_results; - host_results.insert(host_results.end(), return_value_from_host.begin(), - return_value_from_host.end()); - tf_device::LaunchOp host_launch_op = CreateLaunchOpForOutsideCluster( - builder, device_cluster, host_device, host_results); + // map_outside_compilation with return values from host is not implemented. 
+ // This would only be needed if head-tail-outside-compilation supports + // map_outside_compilation"; + assert(core == 0 || return_value_from_host.empty()); - // Create a return op for host computation block - builder.setInsertionPointToEnd(&host_computation_block); - builder.create(device_cluster.getLoc(), - host_launch_op->getResults()); + // Create a single launch op for all outside compiled ops. + llvm::SmallVector host_results; + host_results.insert(host_results.end(), return_value_from_host.begin(), + return_value_from_host.end()); + tf_device::LaunchOp host_launch_op = CreateLaunchOpForOutsideCluster( + builder, device_cluster, core_to_host[core], host_results); + core_to_host_launch.push_back(host_launch_op); + + // Create a return op for host computation block + builder.setInsertionPointToEnd(&host_computation_block); + builder.create(device_cluster.getLoc(), + host_launch_op->getResults()); + } // Move the launch body to last parallel_execute block. Block& parallel_execute_device_block = - parallel_execute_op.GetRegionBlockWithIndex(1); + parallel_execute_op.GetRegionBlockWithIndex( + core_to_tmp_host_launch.size()); builder.setInsertionPointToEnd(¶llel_execute_device_block); // Get the vector of results and types of results for new device cluster @@ -1042,8 +1381,13 @@ tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( MoveOldTpuClusterToNewTpuCluster(device_cluster, after_op_r); - Operation* after_op_host_cluster = host_launch_op.GetBody().getTerminator(); - MoveTmpLaunchOpToNewLaunchOp(tmp_host_launch_op, after_op_host_cluster); + // Move each host-side Launch op. 
+ for (int core = 0; core < core_to_tmp_host_launch.size(); ++core) { + Operation* after_op_host_cluster = + core_to_host_launch[core].GetBody().getTerminator(); + MoveTmpLaunchOpToNewLaunchOp(core_to_tmp_host_launch[core], + after_op_host_cluster); + } return parallel_execute_op; } @@ -1052,102 +1396,121 @@ tf_device::ParallelExecuteOp CreateFinalParallelExecuteOp( // a region for `device_cluster` computation by extracting outside compiled ops // to host computation. LogicalResult CreateParallelExecuteForOutsideCompilation( - ModuleOp module, tf_device::ClusterOp device_cluster, - llvm::StringRef host_device, + tf_device::ClusterOp device_cluster, llvm::SmallVector& ops, + std::optional& is_map_oc, ArrayRef core_to_host, bool has_tpu_device) { OpBuilder builder(device_cluster); llvm::SmallVector returns_from_host; // Create a temporary parallel_execute. This is temporary because the result - // type is not determined until after it is filled. There are two regions in - // `tmp_parallel_execute_op`. The first one is for the host computation for - // outside compilation and the second one is for the original Device cluster - // computation. - const int num_regions = 2; + // type is not determined until after it is filled. The parallel_execute has + // `num_host_regions` assigned to hosts and 1 region for the Device cluster. + // In the ordinary outside compilation case `num_host_regions` is 1 and in the + // `map_outside_compilation` case `num_host_regions == num_cores_per_replica`. 
+ const int num_host_regions = core_to_host.size(); + const int num_regions = 1 + num_host_regions; auto tmp_parallel_execute_op = builder.create( device_cluster.getLoc(), num_regions, llvm::ArrayRef{}); - Block& tmp_host_computation_block = - tmp_parallel_execute_op.GetRegionBlockWithIndex(0); - builder.setInsertionPointToEnd(&tmp_host_computation_block); + SmallVector core_to_host_insertion_point; + SmallVector core_to_tmp_launch; + SmallVector compilation_key_ops; + SmallVector core_to_compilation_key; + SmallVector core_to_device_ordinal_op; + SmallVector core_to_device_ordinal; + for (int core = 0; core < num_host_regions; ++core) { + Block& tmp_host_computation_block = + tmp_parallel_execute_op.GetRegionBlockWithIndex(core); + builder.setInsertionPointToEnd(&tmp_host_computation_block); + // Create a single tmp launch op for all outside compiled ops. + llvm::SmallVector tmp_host_results; + tf_device::LaunchOp tmp_host_launch_op = CreateLaunchOpForOutsideCluster( + builder, device_cluster, core_to_host[core], tmp_host_results); + core_to_tmp_launch.push_back(tmp_host_launch_op); + // Create a tmp return op for tmp host computation block + builder.setInsertionPointToEnd(&tmp_host_computation_block); + builder.create(device_cluster.getLoc(), + llvm::ArrayRef{}); + core_to_host_insertion_point.push_back( + tmp_host_launch_op.GetBody().getTerminator()); - // Create a single tmp launch op for all outside compiled ops. 
- llvm::SmallVector tmp_host_results; - tf_device::LaunchOp tmp_host_launch_op = CreateLaunchOpForOutsideCluster( - builder, device_cluster, host_device, tmp_host_results); + builder.setInsertionPoint(tmp_host_launch_op.GetBody().getTerminator()); - // Create a tmp return op for tmp host computation block - builder.setInsertionPointToEnd(&tmp_host_computation_block); - builder.create(device_cluster.getLoc(), - llvm::ArrayRef{}); - - builder.setInsertionPoint(tmp_host_launch_op.GetBody().getTerminator()); - - Operation* compilation_key_op = nullptr; - Value compilation_key = nullptr; - Operation* device_ordinal_op = nullptr; - - if (has_tpu_device) { - compilation_key_op = - CreateCompilationKeyPlaceholder(device_cluster.getLoc(), builder); - compilation_key = - llvm::dyn_cast( - compilation_key_op) - .getProgram(); - device_ordinal_op = builder.create( - device_cluster.getLoc(), - RankedTensorType::get({}, builder.getI64Type())); - } else { - compilation_key_op = - CreateCpuGpuComilationKeyPlaceholder(device_cluster.getLoc(), builder); - compilation_key = - llvm::dyn_cast(compilation_key_op->getResults()[0]); - device_ordinal_op = builder.create( - device_cluster.getLoc(), - DenseIntElementsAttr::get( - RankedTensorType::get({}, builder.getI64Type()), - static_cast(0))); + // Create message identification ops. 
+ Operation* compilation_key_op = nullptr; + Value compilation_key = nullptr; + Operation* device_ordinal_op = nullptr; + if (has_tpu_device) { + compilation_key_op = + CreateCompilationKeyPlaceholder(device_cluster.getLoc(), builder); + compilation_key = + llvm::dyn_cast( + compilation_key_op) + .getProgram(); + device_ordinal_op = builder.create( + device_cluster.getLoc(), + RankedTensorType::get({}, builder.getI64Type())); + } else { + compilation_key_op = CreateCpuGpuComilationKeyPlaceholder( + device_cluster.getLoc(), builder); + compilation_key = + llvm::dyn_cast(compilation_key_op->getResults()[0]); + device_ordinal_op = builder.create( + device_cluster.getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get({}, builder.getI64Type()), + static_cast(0))); + } + compilation_key_ops.push_back(compilation_key_op); + core_to_compilation_key.push_back(compilation_key); + core_to_device_ordinal_op.push_back(device_ordinal_op); + if (device_cluster->getParentOfType()) + core_to_device_ordinal.push_back( + core_to_device_ordinal_op[core]->getResults()[0]); } - Value device_ordinal = nullptr; - - if (device_cluster->getParentOfType()) { - device_ordinal = device_ordinal_op->getResults()[0]; - } + builder.setInsertionPoint(tmp_parallel_execute_op); int default_device_ordinal = 0; if (failed(GetDefaultDeviceOrdinal(device_cluster, default_device_ordinal))) { return failure(); } + // communication_key_index is part of the message identifier and is + // incremented for each _XlaHostComputeMlir. int communication_key_index = 0; + // Decompose control flow into device and host control flow when outside // compilation is included. 
- if (failed(DecomposeControlFlow(device_cluster, compilation_key, - device_ordinal, default_device_ordinal, - communication_key_index))) + if (failed(DecomposeControlFlow( + device_cluster, core_to_compilation_key, core_to_device_ordinal, + default_device_ordinal, communication_key_index, is_map_oc))) return failure(); // Move all outside compiled ops including control flow to tmp host launch. // Also set the values returned from the host when ops are moved. - if (failed(MoveOpsToHost(device_cluster, &device_cluster.GetBody(), - tmp_host_launch_op.GetBody().getTerminator(), - compilation_key, device_ordinal, - default_device_ordinal, communication_key_index, - &returns_from_host))) + if (failed(MoveToHostMultiCluster( + device_cluster, &device_cluster.GetBody(), + core_to_host_insertion_point, core_to_compilation_key, + core_to_device_ordinal, default_device_ordinal, + /*control_above=*/false, is_map_oc, communication_key_index, + &returns_from_host))) return failure(); llvm::SmallVector returns_from_device; GetReturnValueFromDevice(device_cluster, returns_from_host, returns_from_device); - if (communication_key_index == 0) compilation_key_op->erase(); - if (communication_key_index == 0 || device_ordinal == nullptr) - device_ordinal_op->erase(); + // Remove unused message identification ops. 
+ if (communication_key_index == 0) + for (auto op : compilation_key_ops) op->erase(); + if (communication_key_index == 0 || core_to_device_ordinal.empty()) + for (auto op : core_to_device_ordinal_op) op->erase(); - RemoveOutsideCompilation(tmp_host_launch_op); + for (tf_device::LaunchOp tmp_host_launch_op : core_to_tmp_launch) + RemoveOutsideCompilation(tmp_host_launch_op); tf_device::ParallelExecuteOp parallel_execute_op = - CreateFinalParallelExecuteOp(builder, num_regions, host_device, - device_cluster, tmp_host_launch_op, + CreateFinalParallelExecuteOp(builder, num_regions, core_to_host, + device_cluster, core_to_tmp_launch, returns_from_host, returns_from_device); ops.push_back(tmp_parallel_execute_op); @@ -1167,11 +1530,10 @@ LogicalResult CreateParallelExecuteForOutsideCompilation( LogicalResult CheckClusterResults(tf_device::ClusterOp cluster) { for (OpResult result : cluster.getResults()) { if (!tensorflow::TypeValidForXLA(result.getType())) { - cluster.emitError() - << "The ExtractHeadTailOutsideCompilation pass produced a Device " - "cluster with a result with a non-XLA type: " - << result.getType(); - return failure(); + return cluster.emitError() + << "The ExtractHeadTailOutsideCompilation pass produced a Device " + "cluster with a result with a non-XLA type: " + << result.getType(); } } return success(); @@ -1185,11 +1547,10 @@ LogicalResult CheckAncestorNotOutsideComp(Operation* op) { Operation* iter_op = op; while (auto* parent_op = iter_op->getParentOp()) { if (parent_op->getAttrOfType(kXlaOutsideCompilationAttr)) { - op->emitOpError() - << "An op marked for outside compilation (having attribute " - << kXlaOutsideCompilationAttr - << ") has an ancestor marked for outside compilation."; - return failure(); + return op->emitOpError() + << "An op marked for outside compilation (having attribute " + << kXlaOutsideCompilationAttr + << ") has an ancestor marked for outside compilation."; } iter_op = parent_op; } @@ -1226,15 +1587,15 @@ void 
ExtractOutsideCompilation::runOnOperation() { return signalPassFailure(); llvm::SmallVector tmp_parallel_execute_ops; + std::optional is_map_oc; module.walk([&](tf_device::ClusterOp device_cluster) { if (HasOutsideCompilationNested(device_cluster.getOperation())) { - std::string host_device; - if (failed(tensorflow::GetHostDeviceOutsideComputation( - devices, device_cluster, &host_device))) + SmallVector core_to_host; + if (failed(tensorflow::GetDeviceToHostMap(device_cluster, core_to_host))) return signalPassFailure(); if (failed(CreateParallelExecuteForOutsideCompilation( - module, device_cluster, host_device, tmp_parallel_execute_ops, + device_cluster, tmp_parallel_execute_ops, is_map_oc, core_to_host, tensorflow::HasTPUDevice(devices)))) return signalPassFailure(); } @@ -1248,8 +1609,10 @@ void ExtractOutsideCompilation::runOnOperation() { // on ops outside of tf_device.cluster don't have any meaning and can lead to // errors later on. These ops were likely lifted out of the // tf_device.cluster in an earlier pass. - module.walk( - [](Operation* op) { op->removeAttr("_xla_outside_compilation"); }); + module.walk([](Operation* op) { + op->removeAttr(kXlaOutsideCompilationAttr); + op->removeAttr(kXlaMapOutsideCompilationAttr); + }); if (failed(CheckPostconditions(module))) return signalPassFailure(); } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc index 433308c7966..e8520cb932a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo.cc @@ -41,6 +41,7 @@ limitations under the License. 
#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/IRMapping.h" // from @llvm-project #include "mlir/IR/ImplicitLocOpBuilder.h" // from @llvm-project @@ -50,6 +51,7 @@ limitations under the License. #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project #include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -183,7 +185,7 @@ struct ConvertNdConvOp { auto num_spatial_dims = conv_op.getDimensionNumbers().getInputSpatialDimensions().size(); - // TODO(b/158636600): Currently we don't support 3D Convolution. + // TODO: b/158636600 - Currently we don't support 3D Convolution. if (num_spatial_dims != SupportedSpatialDims) return false; return true; @@ -204,6 +206,20 @@ class Convert1DConvOp : public OpConversionPattern, // Check that input is a supported 1d convolution. // + // stablehlo.convolution allows ops without window strides, where default + // value 1 will be set for each spatial dimension. However, window_strides + // are needed for mhlo.convolution -> tf.Conv2D conversion. Therefore, in + // this conversion path have a fallback to set window strides if not set. 
+ if (!conv_op.getWindowStrides().has_value()) { + const int window_strides_size = + conv_op.getDimensionNumbers().getInputSpatialDimensions().size(); + std::vector window_strides_2d_array_default(window_strides_size, + 1); + DenseIntElementsAttr window_strides_2d_default = + rewriter.getI64TensorAttr(window_strides_2d_array_default); + conv_op.setWindowStridesAttr(window_strides_2d_default); + } + if (!IsSupportedConvOp(conv_op) || conv_op->getNumResults() != 1) return rewriter.notifyMatchFailure(conv_op, "unsupported conv op."); @@ -1219,62 +1235,14 @@ class ConvertDynamicUpdateSliceOp Type idx_type = start_indices_type.getElementType(); int64_t shape_dim = operand_type.getRank(); - auto operand_shape = operand_type.getShape(); - auto update_shape = update_type.getShape(); - - ImplicitLocOpBuilder builder(op.getLoc(), rewriter); - Value zero_cst = BuildIntConstOp(builder, rewriter, 0, idx_type); - Value one_cst = BuildIntConstOp(builder, rewriter, 1, idx_type); - // Clamp start indices in [0, operand_size - update_size]. llvm::SmallVector start_indices_vector; Append(start_indices_vector, op.getStartIndices()); auto shape_tensor_type = RankedTensorType::get({shape_dim}, idx_type); - Value start_indices_tensor = - builder.create(shape_tensor_type, start_indices_vector); - Value operand_shape_cst = - BuildIntArrayConstOp(builder, rewriter, operand_shape, idx_type); - Value update_shape_cst = - BuildIntArrayConstOp(builder, rewriter, update_shape, idx_type); - Value max_start_indices = - builder.create(operand_shape_cst, update_shape_cst); - Value start_indices_clip_max = - builder.create(start_indices_tensor, max_start_indices); - Value clamped_start_indices = - builder.create(start_indices_clip_max, zero_cst); - - // Do dynamic_upate_slice on flattened operand and update with the aid of - // tf.TensorScatterUpdate op. It takes in 3 parameters: flat_operand, - // indices and flat_update. The indices are computed as follows: - // 1. Construct a range (0, n_operand). 
It arranges a id number to each - // element position in operand. - // 2. Reshape the range to the shape of operand. - // 3. Compute the id numbers of update positions by choose a slice form - // clamped_start_indices to clamped_start_indices + update_size. - // 4. Flatten the update id numbers and the indices is obtained. - int64_t n_operand = operand_type.getNumElements(); - Value n_operand_cst = - BuildIntConstOp(builder, rewriter, n_operand, idx_type); - Value range_flat = - builder.create(zero_cst, n_operand_cst, one_cst); - Value range = BuildReshapeOp(builder, rewriter, range_flat, operand_shape, - idx_type, idx_type); - Value update_indices_raw = - BuildSliceOp(builder, rewriter, range, clamped_start_indices, - update_shape, idx_type, idx_type); - int64_t n_update = update_type.getNumElements(); - Type element_type = operand_type.getElementType(); - Value update_indices = BuildReshapeOp(builder, rewriter, update_indices_raw, - {n_update, 1}, idx_type, idx_type); - Value operand_flat = BuildReshapeOp(builder, rewriter, op.getOperand(), - {n_operand}, idx_type, element_type); - Value update_flat = BuildReshapeOp(builder, rewriter, op.getUpdate(), - {n_update}, idx_type, element_type); - Value flat_result = builder.create( - operand_flat, update_indices, update_flat); - - // Reshape back before return. 
- rewriter.replaceOpWithNewOp(op, operand_type, flat_result, - operand_shape_cst); + Value start_indices_tensor = rewriter.create( + op.getLoc(), shape_tensor_type, start_indices_vector); + rewriter.replaceOpWithNewOp( + op, op.getType(), op.getOperand(), op.getUpdate(), + start_indices_tensor); return success(); }; }; @@ -1547,21 +1515,26 @@ bool MatchIota(DenseIntElementsAttr dimensions, Value iota) { MatchIotaConst(dimensions, iota); } +template bool MatchTopKComparator(Region& comparator) { if (!comparator.hasOneBlock()) return false; Block& comparator_blk = comparator.front(); using OpListType = llvm::iplist; OpListType& operations = comparator_blk.getOperations(); if (operations.size() != 2) return false; - auto compare_op = dyn_cast_or_null(&operations.front()); - auto return_op = dyn_cast_or_null(&operations.back()); + auto compare_op = dyn_cast_or_null(&operations.front()); + auto return_op = dyn_cast_or_null(&operations.back()); if (!compare_op || !return_op) return false; // TODO(xuanyuanluo): Support mhlo::ComparisonDirection::LT direction. 
- if (compare_op.getComparisonDirection() != mhlo::ComparisonDirection::GT) + if (std::is_same_v && + dyn_cast_or_null(&operations.front()) + .getComparisonDirection() != mhlo::ComparisonDirection::GT) { return false; - if (compare_op.getLhs() != comparator_blk.getArgument(0) || - compare_op.getRhs() != comparator_blk.getArgument(1)) + } + if (compare_op.getOperands()[0] != comparator_blk.getArgument(0) || + compare_op.getOperands()[1] != comparator_blk.getArgument(1)) { return false; + } return return_op.getOperands().front() == compare_op.getResult(); } @@ -1612,7 +1585,8 @@ class ConvertSortToTfTopk : public OpConversionPattern { if (!MatchIota(sort_dim_attr, indices)) return rewriter.notifyMatchFailure( op, "the second operand is supposed to be obtained from IOTA"); - if (!MatchTopKComparator(op.getComparator())) + if (!MatchTopKComparator( + op.getComparator())) return rewriter.notifyMatchFailure(op, "only match for GT comparator"); ImplicitLocOpBuilder builder(op.getLoc(), rewriter); Value k_cst = BuildIntConstOp(builder, rewriter, k, rewriter.getI32Type()); @@ -1798,7 +1772,7 @@ Value ConvertDotGeneralOp(PatternRewriter& rewriter, Operation* old_op) { } // Checks if the specified region is a binary reduction function that takes 2 -// inputs, passes it to an instance of the specifiied reduction op and then +// inputs, passes it to an instance of the specified reduction op and then // returns the result. template LogicalResult MatchBinaryReduceFunction(mlir::Region& function) { @@ -1835,7 +1809,7 @@ LogicalResult MatchBinaryReduceFunction(mlir::Region& function) { } // Replace BinaryOp with a combination of TfBinaryOp and TfReduceOp if the -// init value doesn't match the expection of TfReduceOp. +// init value doesn't match the expectation of TfReduceOp. 
template LogicalResult rewriteNonMatchInitValue(mhlo::ReduceOp reduce_op, Value input, ConstOp reduction_indices, @@ -1849,7 +1823,7 @@ LogicalResult rewriteNonMatchInitValue(mhlo::ReduceOp reduce_op, Value input, return success(); } -// Cannot replace BinaryOp if the init value doesn't match the expection of +// Cannot replace BinaryOp if the init value doesn't match the expectation of // TfReduceOp and there is no corresponding TfBinaryOp. template <> LogicalResult rewriteNonMatchInitValue( @@ -2449,7 +2423,8 @@ class ConvertLoweredCumOp : public OpConversionPattern { } if (cumulative_axis == -1) { - return rewriter.notifyMatchFailure(rw, "no reduced dimension is found."); + rw.emitOpError() << "no reduced dimension is found."; + return failure(); } // For a cumulative op, padding (expressed as a list of left-padding and @@ -2993,6 +2968,10 @@ class ConvertGatherOp : public OpConversionPattern { LogicalResult matchAndRewrite( mhlo::GatherOp gather_op, OpAdaptor adaptor, ConversionPatternRewriter& rewriter) const final { + if (succeeded(ConvertGatherOpToSlice(gather_op, rewriter))) { + return success(); + } + Value operand = gather_op.getOperand(); Value start_indices = gather_op.getStartIndices(); @@ -3113,6 +3092,153 @@ class ConvertGatherOp : public OpConversionPattern { return success(); } + // Convert gather op to tf.slice and tf.concat + LogicalResult ConvertGatherOpToSlice( + mhlo::GatherOp gather_op, ConversionPatternRewriter& rewriter) const { + Value operand = gather_op.getOperand(); + Value start_indices = gather_op.getStartIndices(); + static const int rank_two = 2; + // This converts a gather op to multiple slice ops, cap the number of slice + // ops allowed. + static const int max_batch_size = 50; + + // Can only convert with static shaped gather. 
+ ShapedType operand_type = operand.getType().cast(); + ShapedType start_indices_type = start_indices.getType().cast(); + ShapedType result_type = gather_op.getResult().getType().cast(); + if (!operand_type.hasStaticShape() || + !start_indices_type.hasStaticShape() || !result_type.hasStaticShape()) { + gather_op.emitOpError() << "Dynamic shaped inputs are not supported."; + return failure(); + } + + auto start_index_map = gather_op.getDimensionNumbers().getStartIndexMap(); + auto collapsed_slice_dims = + gather_op.getDimensionNumbers().getCollapsedSliceDims(); + auto offset_dims = gather_op.getDimensionNumbers().getOffsetDims(); + auto slice_sizes = gather_op.getSliceSizes(); + llvm::SmallVector slice_sizes_vector; + slice_sizes_vector.reserve(slice_sizes.size()); + for (int64_t s : slice_sizes.getValues()) { + slice_sizes_vector.push_back(s); + } + + llvm::SmallVector batch_dims; + // Offset dims are guaranteed to be sorted. + int offset_index = 0; + for (int64_t i = 0; i < result_type.getRank(); ++i) { + if (offset_index >= offset_dims.size() || + offset_dims[offset_index] != i) { + batch_dims.push_back(i); + } else { + ++offset_index; + } + } + // Here we only support gather with one batch dim and the batch dim is 0. + if (batch_dims.size() != 1 || batch_dims[0] != 0) { + return failure(); + } + int64_t batch_dim = batch_dims[0]; + // Batch dim in operand and start indices should match. + if (operand_type.getDimSize(batch_dim) > max_batch_size || + operand_type.getRank() != rank_two || + start_indices_type.getRank() != rank_two || + operand_type.getDimSize(batch_dim) != + start_indices_type.getDimSize(batch_dim) || + slice_sizes_vector[batch_dim] != 1) { + return failure(); + } + // Here we only support the case where [0, 1] in start_indices maps to + // operand[0, 1] + for (int64_t i = 0; i < start_index_map.size(); i++) { + if (start_index_map[i] != i) { + return failure(); + } + } + // Collapsed slice dims should contain the batch dim. 
+ if (collapsed_slice_dims.size() != start_index_map.size() - 1 || + collapsed_slice_dims.size() != 1 || collapsed_slice_dims[0] != 0) { + return failure(); + } + + // Normalize start_indices so index_vector_dim == start_indices.rank() - 1. + int64_t index_vector_dim = + gather_op.getDimensionNumbers().getIndexVectorDim(); + if (failed(NormalizeIndexVector(gather_op, start_indices, + start_indices_type, index_vector_dim, + rewriter))) { + return failure(); + } + + ImplicitLocOpBuilder builder(gather_op.getLoc(), rewriter); + // Clamp the start indices to ensure it is in bounds. + auto max_start_indices = BuildIntArrayConstOp( + builder, rewriter, + llvm::SmallVector( + {operand_type.getDimSize(0) - slice_sizes_vector[0], + operand_type.getDimSize(1) - slice_sizes_vector[1]}), + start_indices_type.getElementType()); + auto min_start_indices = BuildIntArrayConstOp( + builder, rewriter, llvm::SmallVector({0, 0}), + start_indices_type.getElementType()); + auto start_indices_max_op = rewriter.create( + gather_op.getLoc(), start_indices, min_start_indices); + auto clamped_start_indices_op = rewriter.create( + gather_op.getLoc(), start_indices_max_op, max_start_indices); + + int64_t batch_size = start_indices_type.getDimSize(batch_dim); + auto slice_size = BuildIntArrayConstOp( + builder, rewriter, slice_sizes_vector, rewriter.getI32Type()); + if (batch_size == 1) { + auto squeeze_op = rewriter.create( + gather_op.getLoc(), + RankedTensorType::get({rank_two}, + start_indices_type.getElementType()), + clamped_start_indices_op, + rewriter.getI64ArrayAttr(llvm::ArrayRef({batch_dim}))); + auto slice_op = + rewriter.create(gather_op.getLoc(), gather_op.getType(), + operand, squeeze_op, slice_size); + rewriter.replaceOp(gather_op, {slice_op}); + return mlir::success(); + } + + llvm::SmallVector slices; + slices.reserve(batch_size); + for (int64_t i = 0; i < batch_size; ++i) { + auto zero = BuildIntArrayConstOp(builder, rewriter, + llvm::SmallVector({i, 0}), + 
rewriter.getI32Type()); + auto two = BuildIntArrayConstOp(builder, rewriter, + llvm::SmallVector({1, 2}), + rewriter.getI32Type()); + auto begin = rewriter.create( + gather_op.getLoc(), + RankedTensorType::get({1, 2}, start_indices_type.getElementType()), + clamped_start_indices_op, zero, two); + auto squeeze_op = rewriter.create( + gather_op.getLoc(), + RankedTensorType::get({rank_two}, + start_indices_type.getElementType()), + begin, + rewriter.getI64ArrayAttr(llvm::ArrayRef({batch_dim}))); + auto slice_op = rewriter.create( + gather_op.getLoc(), + RankedTensorType::get({1, slice_sizes_vector[1]}, + operand_type.getElementType()), + operand, squeeze_op, slice_size); + slices.push_back(slice_op); + } + auto scalar_type = RankedTensorType::get({}, rewriter.getI32Type()); + auto zero_scalar = rewriter.create( + gather_op.getLoc(), + DenseIntElementsAttr::get(scalar_type, static_cast(0))); + auto concat_op = rewriter.create( + gather_op.getLoc(), result_type, slices, zero_scalar); + rewriter.replaceOp(gather_op, {concat_op}); + return mlir::success(); + } + private: // Canonicalize the offset dims to make sure the offset dims are the trailing // dimensions of the output tensor. @@ -3332,11 +3458,41 @@ class ConvertScatterOp : public OpConversionPattern { loc, permutation_and_shape.shape, operands[0], permutation_and_shape.permutation); + Value new_indices = indices; + int64_t index_depth = + permutation_and_shape.shape.getRank() - inserted_window_dims.size(); + int64_t num_updates = indices_type.getDimSize(0); + // For TF::TensorScatterUpdateOp, `indices` must have at least 2 axes: + // `(num_updates, index_depth)`. Reshape indices and updates if necessary. 
+ if (std::is_same::value && + indices_type.getRank() == 1 && updates_type.getRank() == 1 && + index_depth == 1 && num_updates == 1) { + ImplicitLocOpBuilder builder(loc, rewriter); + auto indices_shape = BuildIntArrayConstOp( + builder, rewriter, + llvm::SmallVector({num_updates, index_depth}), + rewriter.getI32Type()); + new_indices = rewriter.create( + loc, + RankedTensorType::get({num_updates, index_depth}, + indices_type.getElementType()), + indices, indices_shape); + auto updates_shape = BuildIntArrayConstOp( + builder, rewriter, + llvm::SmallVector({num_updates, updates_type.getDimSize(0)}), + rewriter.getI32Type()); + new_updates = rewriter.create( + loc, + RankedTensorType::get({1, updates_type.getDimSize(0)}, + updates_type.getElementType()), + new_updates, updates_shape); + } + // Apply TF scatter to update the trailing dimensions of the // transposed operand. auto tf_scatter_op = rewriter.create(loc, permutation_and_shape.shape, - transposed_operand, indices, new_updates); + transposed_operand, new_indices, new_updates); // Reverse the earlier transpose. 
auto inverse_permutation = @@ -3398,6 +3554,161 @@ class ConvertPopulationCountOp } }; +class ConvertCustomCallWithApproxTopK + : public mlir::OpConversionPattern { + public: + explicit ConvertCustomCallWithApproxTopK(MLIRContext* context, + mlir::ModuleOp* module_op) + : OpConversionPattern(context), + module_op_(module_op) {} + + mlir::LogicalResult matchAndRewrite( + mhlo::CustomCallOp op, OpAdaptor adaptor, + ConversionPatternRewriter& rewriter) const final { + if (op.getCallTargetName() != "ApproxTopK") { + return mlir::failure(); + } + auto is_supported_attr_name = [](NamedAttribute attr) { + auto name = attr.getName(); + return name == "call_target_name" || name == "backend_config" || + name == "api_version" || name == "called_computations"; + }; + for (const auto& attr : op->getAttrs()) { + if (!is_supported_attr_name(attr)) { + return op.emitOpError() + << attr.getName().getValue() + << " is not a supported attribute for ApproxTopK"; + } + } + auto backend_config = + op.getBackendConfigAttr().dyn_cast_or_null(); + if (!backend_config) { + return op.emitOpError() << "Missing backend_config attribute"; + } + + for (const auto& attr : backend_config) { + auto name = attr.getName(); + if (!(name == "top_k" || name == "reduction_dim" || + name == "recall_target" || name == "aggregate_to_topk" || + name == "reduction_input_size_override" || name == "is_fallback")) { + return op.emitOpError() + << name.getValue() << " is not a supported backend_config" + << " attribute for ApproxTopK"; + } + } + + auto check_i64_attr = + [&](const std::string& attr_name) -> mlir::LogicalResult { + if (!backend_config.contains(attr_name)) { + return op.emitOpError() + << "Missing " << attr_name << " attribute in backend_config"; + } + auto attr = backend_config.getAs(attr_name); + if (!attr || !attr.getType().isInteger(64)) { + return op.emitOpError() + << attr_name + << " attribute in backend_config must be of i64 type"; + } + return success(); + }; + auto check_f32_attr = + 
[&](const std::string& attr_name) -> mlir::LogicalResult { + if (!backend_config.contains(attr_name)) { + return op.emitOpError() + << "Missing " << attr_name << " attribute in backend_config"; + } + auto attr = backend_config.getAs(attr_name); + if (!attr || !attr.getType().isF32()) { + return op.emitOpError() + << attr_name + << " attribute in backend_config must be of f32 type"; + } + return success(); + }; + auto check_bool_attr = + [&](const std::string& attr_name) -> mlir::LogicalResult { + if (!backend_config.contains(attr_name)) { + return op.emitOpError() + << "Missing " << attr_name << " attribute in backend_config"; + } + auto attr = backend_config.getAs(attr_name); + if (!attr) { + return op.emitOpError() + << attr_name + << " attribute in backend_config must be of bool type"; + } + return success(); + }; + if (failed(check_i64_attr("top_k"))) return failure(); + if (failed(check_i64_attr("reduction_dim"))) return failure(); + if (failed(check_f32_attr("recall_target"))) return failure(); + if (failed(check_bool_attr("aggregate_to_topk"))) return failure(); + if (failed(check_i64_attr("reduction_input_size_override"))) { + return failure(); + } + bool has_is_fallback = backend_config.contains("is_fallback"); + if (has_is_fallback && !backend_config.getAs("is_fallback")) { + return op.emitOpError() + << "is_fallback attribute in backend_config must be of bool type"; + } + + auto top_k_attr = backend_config.getAs("top_k"); + auto reduction_dim_attr = + backend_config.getAs("reduction_dim"); + auto recall_target_attr = backend_config.getAs("recall_target"); + auto aggregate_to_topk_attr = + backend_config.getAs("aggregate_to_topk"); + auto reduction_input_size_override_attr = + backend_config.getAs("reduction_input_size_override"); + if (op.getInputs().size() % 2 != 0) { + return op.emitOpError() << "ApproxTopK takes an even number of operands."; + } + + auto called_computations = op.getCalledComputations(); + if (called_computations.size() != 1) { + 
return op.emitOpError() + << "ApproxTopK takes exactly 1 called_computation."; + } + mlir::func::FuncOp callee = module_op_->lookupSymbol( + op.getCalledComputations()[0].cast()); + mlir::FunctionType callee_type = callee.getFunctionType(); + SmallVector expected_callee_input_types; + auto num_inputs = op.getInputs().size() / 2; + for (unsigned i = 0; i < num_inputs; ++i) { + auto input_type = op.getOperand(i).getType().dyn_cast(); + auto scalar = RankedTensorType::get({}, input_type.getElementType()); + expected_callee_input_types.push_back(scalar); + expected_callee_input_types.push_back(scalar); + } + FunctionType expected_callee_type = mlir::FunctionType::get( + op->getContext(), expected_callee_input_types, + RankedTensorType::get({}, IntegerType::get(op->getContext(), 1))); + if (callee_type != expected_callee_type) { + return op.emitOpError() + << "called_computation type does not match the expected type. Got " + << callee_type << " expected " << expected_callee_type; + } + if (!MatchTopKComparator( + callee.getBody()) && + !MatchTopKComparator( + callee.getBody())) { + return op.emitOpError() << "only match for GT comparator"; + } + auto is_max_k = rewriter.getBoolAttr(true); + + auto approx_top_k = rewriter.create( + op.getLoc(), op->getResultTypes(), op.getInputs()[0], top_k_attr, + reduction_dim_attr, recall_target_attr, is_max_k, + reduction_input_size_override_attr, aggregate_to_topk_attr); + + rewriter.replaceOp(op, approx_top_k.getResults()); + return mlir::success(); + } + + private: + mlir::ModuleOp* module_op_; +}; + // Returns true if broadcast_dimensions obey Tensorflow convention, as in new // dimensions are added as prefix. bool IsTFStyleBroadcast(DenseIntElementsAttr broadcast_dimensions, @@ -3441,9 +3752,10 @@ arith::ConstantOp ExpandedShape(PatternRewriter& rewriter, Value input, /// Performs the lowering to XLA dialect. 
void LegalizeHloToTf::runOnOperation() { MLIRContext& context = getContext(); + mlir::ModuleOp module = getOperation()->getParentOfType(); - // Add legalization patterns to the list. RewritePatternSet patterns(&getContext()); + patterns.add(&context, &module); PopulateLegalizeHloToTfPatterns(&patterns, &context); ConversionTarget target(context); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td index 16493c30286..0261783da7c 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/legalize_hlo_patterns.td @@ -189,6 +189,9 @@ def : Pat<(MHLO_ReverseOp $op, $dims), (TF_ReverseV2Op $op, (TF_ConstOp $dims))> def : Pat<(MHLO_ReshapeOp:$output $input), (TF_ReshapeOp $input, (ShapeToConst $output))>; +// Both implement the Banker's rounding. +def : Pat<(MHLO_RoundNearestEvenOp $input), (TF_RoundOp $input)>; + //===----------------------------------------------------------------------===// // Ternary op patterns. //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc index 738a109dec6..ed1e896e2d4 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/mark_ops_for_outside_compilation.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/FormatVariadic.h" #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project @@ -32,6 +33,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/lower_tf.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/string_util.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "tensorflow/core/lib/monitoring/gauge.h" @@ -395,6 +397,38 @@ void UnmarkChildren(ModuleOp module) { }); } +constexpr int kTooManyOutsideCompileRegionThreshold = 32; +constexpr int kOpDetailCount = 8; + +void WarnOnExcessOutsideCompilationOps(ModuleOp module) { + // Count the number of outside compilation ops. If it exceeds the reporting + // threshold, warn the user that their model may run slowly. + llvm::SmallVector outside_compile_ops; + module->walk([&](Operation* op) { + if (op->getAttrOfType(kXlaOutsideCompilationAttr)) { + outside_compile_ops.push_back(op); + } + }); + + if (outside_compile_ops.size() > kTooManyOutsideCompileRegionThreshold) { + llvm::SmallVector op_info; + for (int i = 0; i < kOpDetailCount; ++i) { + auto& op = outside_compile_ops[i]; + op_info.push_back(tensorflow::OpAsString(*op)); + } + + LOG(WARNING) << outside_compile_ops.size() << " outside compilation " + << "regions found while processing " + << module->getName().getStringRef().str() + << ". This may result in excessively slow model execution. 
" + << "First " << op_info.size() + << " ops: " << absl::StrJoin(op_info, "\n"); + } else { + LOG(INFO) << "Found " << outside_compile_ops.size() + << " outside compilation regions."; + } +} + void MarkOpsForOutsideCompilation::runOnOperation() { auto module = getOperation(); const Dialect* tf_dialect = getContext().getLoadedDialect("tf"); @@ -446,6 +480,8 @@ void MarkOpsForOutsideCompilation::runOnOperation() { if (result.wasInterrupted()) return signalPassFailure(); UnmarkChildren(module); + + WarnOnExcessOutsideCompilationOps(module); } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index 4092b90411e..a46a1204af2 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -323,6 +323,20 @@ std::unique_ptr> CreateSplitIntoIslandPerOpPass(); // CPU/GPU bridge. void CreateTFXLABridgePipeline(OpPassManager& pm); +//===----------------------------------------------------------------------===// +// XlaCallModule +//===----------------------------------------------------------------------===// + +// Creates a pass that deserializes functions in the StableHLO modules from +// `tf.XlaCallModule` to the top-level module. +std::unique_ptr> +CreateXlaCallModuleDeserializationPass(); + +// Creates a pass that serializes StableHLO functions referenced by +// `tf.XlaCallModule` from the top-level module to `tf.XlaCallModule`'s +// `module` attribute. +std::unique_ptr> CreateXlaCallModuleSerializationPass(); + } // namespace TF namespace tf_executor { @@ -435,7 +449,9 @@ std::unique_ptr> CreateReplicaIDToDeviceOrdinalPass(); // Creates a pass that adds pipelining to a graph that contains device -// accelerated embeddings. +// accelerated embeddings. The EmbeddingSequencingPass is a temporary fallback +// while developing full pipelining capabilities. 
+std::unique_ptr> CreateEmbeddingSequencingPass(); std::unique_ptr> CreateEmbeddingPipeliningPass(); // Creates a pass that creates `tf_executor.island` from a single @@ -491,6 +507,9 @@ std::unique_ptr> CreateXlaInlineDeviceOpsPass(); // Creates a pass that rewrites partitioned calls with `_xla_compile_device // type` with `tf.XlaLaunch` ops. std::unique_ptr> CreateXlaRewritePass(); + +// Create a pass that validates the input graph to the CPU/GPU bridge. +std::unique_ptr> CreateXlaValidateInputsPass(); } // namespace TFDevice namespace TFTPU { @@ -725,6 +744,9 @@ enum MoveTransposeDirection { kBegin, kEnd }; #define GEN_PASS_DECL_TRANSFORMEINSUMPASS #define GEN_PASS_DECL_UNROLLBATCHMATMULPASS #define GEN_PASS_DECL_VERIFYSUITABLEFOREXPORTPASS +#define GEN_PASS_DECL_XLACALLMODULEDESERIALIZATIONPASS +#define GEN_PASS_DECL_XLACALLMODULESERIALIZATIONPASS +#define GEN_PASS_DECL_XLACALLMODULECUSTOMCALLTFFUNCTIONRENAMINGPASS #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" } // namespace detail using namespace detail; // NOLINT @@ -746,6 +768,7 @@ namespace TFDevice { #define GEN_PASS_DECL_XLACLUSTERFORMATIONPASS #define GEN_PASS_DECL_XLAINLINEDEVICEOPSPASS #define GEN_PASS_DECL_XLAREWRITEPASS +#define GEN_PASS_DECL_XLAVALIDATEINPUTSPASS #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.h.inc" } // namespace TFDevice diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc index a8bfb700209..e03eb9a9228 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting.cc @@ -308,7 +308,7 @@ LogicalResult RegionResourceHoister::Analyze() { // Since all the sub-regions within this region (i.e., regions attached to // op's in this region) have themselves gone through lifting, all resource // users are expected to be operations in this region 
and not embedded - // within other sub-regions attached to op's in this region. So the check + // within other sub-regions attached to ops in this region. So the check // for whether a user is in one of the regions attached to this op is // straightforward. if (user->getParentRegion()->getParentOp() != op_) continue; @@ -1260,6 +1260,11 @@ void ResourceOpLiftingPass::runOnOperation() { }); if (walk_result.wasInterrupted()) return signalPassFailure(); + + // Clean up and canonicalize to remove dead local variables as some local + // variables might be dead after hoisting resource loads/stores. + if (failed(TF::CleanupAndCanonicalizeForResourceOpLifting(module))) + return signalPassFailure(); } #define GEN_PASS_DEF_RESOURCEOPLIFTINGFORMAINFUNCTIONPASS diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc index 99693a91b2f..2f1c675b305 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/transforms/resource_op_lifting_cleanup.h" #include +#include #include "llvm/ADT/BitVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -52,22 +53,55 @@ void RemovePassthroughOp(Block &block) { } } +using LocalVarOp = std::variant; + +Value LocalVarOp_resource(LocalVarOp &op) { + if (auto var_handle_op = std::get_if(&op)) { + return var_handle_op->getResource(); + } else { + return std::get(op).getResource(); + } +} + +void LocalVarOp_erase(LocalVarOp &op) { + if (auto var_handle_op = std::get_if(&op)) { + var_handle_op->erase(); + } else { + std::get(op).erase(); + } +} + +std::optional IsLocalVarOp(Operation &op) { + if (TF::MlirLocalVarOp mlir_local_var_op = + llvm::dyn_cast(&op)) { + return std::make_optional(LocalVarOp(mlir_local_var_op)); + } + if (TF::VarHandleOp var_handle_op = llvm::dyn_cast(&op)) { + auto ANONYMOUS_NAME = ::tensorflow::ResourceHandle::ANONYMOUS_NAME; + if (var_handle_op.getSharedName() == ANONYMOUS_NAME) { + return std::make_optional(LocalVarOp(var_handle_op)); + } + } + return {}; +} + // Eliminate local variables that are only assigned to but never read, and thus // are dead. 
void RemoveDeadLocalVariables(Block &block) { - llvm::SmallVector local_vars; + llvm::SmallVector local_vars; for (Operation &op : block) { - if (auto local_var = llvm::dyn_cast(&op)) { - local_vars.push_back(local_var); + if (auto local_var = IsLocalVarOp(op)) { + local_vars.push_back(local_var.value()); } } for (auto local_var : local_vars) { - auto users = local_var.getResource().getUsers(); + auto users = LocalVarOp_resource(local_var).getUsers(); if (llvm::all_of(users, [](const Operation *user) { - return isa(user); + return isa(user) || + isa(user); })) { for (auto user : llvm::make_early_inc_range(users)) user->erase(); - local_var.erase(); + LocalVarOp_erase(local_var); } } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index 1edc7f4bb73..2cf33360e10 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -530,7 +530,7 @@ struct ValuePortHasher { }; using ValuePortResultMap = - std::unordered_map; + absl::flat_hash_map; using ComputedQueryFn = function_ref; using ValueQueryFn = function_ref; using ValuePortInputs = SmallVectorImpl; @@ -1200,14 +1200,24 @@ bool ShapeInference::InferShapeForXlaCallModule(XlaCallModuleOp op) { for (auto attr : op.getDimArgsSpec().getAsRange()) { dim_args_spec.push_back(attr.getValue().str()); } - + std::vector disabled_checks; + for (auto attr : op.getDisabledChecks().getAsRange()) { + disabled_checks.push_back(attr.getValue().str()); + } + std::vector platforms; + for (auto attr : op.getPlatforms().getAsRange()) { + platforms.push_back(attr.getValue().str()); + } // Always use the first platform. The assumption is that shape inference // results should be the same regardless of which platform is chosen. - int platform_index = op.getPlatforms().size() > 1 ? 
0 : -1; + // Very old versions of the op have an empty platforms attribute. + std::string loading_platform = + (platforms.empty() ? "CPU" : platforms.front()); auto l = tensorflow::XlaCallModuleLoader::Create( &xla_call_module_context_, op.getVersion(), op.getModule().str(), - std::move(dim_args_spec), platform_index); + std::move(dim_args_spec), std::move(disabled_checks), + std::move(platforms), std::move(loading_platform)); if (!l.ok()) { LLVM_DEBUG(llvm::dbgs() << "Parsing error in XlaCallModule: " << l.status().ToString() << "\n"); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td index e307266c93f..e8d78b646cf 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.td @@ -390,3 +390,12 @@ def XlaRewritePass : Pass<"tf-xla-rewrite", "mlir::ModuleOp"> { let constructor = "TFDevice::CreateXlaRewritePass()"; let dependentDialects = ["tf_device::TensorFlowDeviceDialect"]; } + +def XlaValidateInputsPass : Pass<"tf-xla-validate-inputs", "ModuleOp"> { + let summary = "Validates inputs to the TF CPU/GPU bridge"; + let description = [{ + This pass checks that the IR has valid input to CPU/GPU TF/XLA bridge.
+ }]; + let constructor = "TFDevice::CreateXlaValidateInputsPass()"; + let dependentDialects = ["tf_device::TensorFlowDeviceDialect"]; +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td index 839d9d601d9..93d2a9c708b 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td @@ -402,6 +402,16 @@ def EmbeddingPipeliningPass : Pass<"tf-embedding-pipelining", "mlir::ModuleOp"> }]; } +def EmbeddingSequencingPass : Pass<"tf-embedding-sequencing", "mlir::ModuleOp"> { + let summary = "Rewrite graph for sequential execution of embeddings"; + let constructor = "TFDevice::CreateEmbeddingSequencingPass()"; + let description = [{ + This is a strictly sequential and formally correct fallback option for the + embedding pipelining pass intended for debugging during pipelining + development. + }]; +} + def ConvertReadonlyReferenceVariablesToResourceVariablesPass : Pass<"tf-readonly-references-to-resources", "mlir::func::FuncOp"> { let summary = "Convert readonly reference variables to resource variables."; @@ -2707,3 +2717,70 @@ def NameAnonymousIteratorsPass : Pass<"tf-name-anonymous-iterators", "ModuleOp"> }]; let constructor = "TF::CreateNameAnonymousIteratorsPass()"; } + +//===----------------------------------------------------------------------===// +// XlaCallModule +//===----------------------------------------------------------------------===// + +def XlaCallModuleDeserializationPass + : Pass<"tf-xla-call-module-deserialization", "ModuleOp"> { + let summary = "Deserializes StableHLO functions embedded in `tf.XlaCallModule` to top level module"; + + let description = [{ + This pass deserializes the StableHLO bytecodes embedded in tf.XlaCallModule, + then outlines the functions in the deserialized StableHLO module to the top + level MLIR module, with function renamings to avoid naming conflicts. 
+ + After the outlining, it updates tf.XlaCallModule's module attribute to be + empty, adds an `_entry_function` attribute referring to the entry function. + It also adds a `_from_xla_call_module: true` attribute to each lifted + StableHLO function. + }]; + + // These dialects are needed by stablehlo deserialization. + // + // We use tensorflow::XlaCallModuleLoader. + // tensorflow::XlaCallModuleLoader will get or load dialects: + // Func, Stablehlo, Mhlo, Chlo, and Vhlo. + // + // XlaCallModuleLoader uses mlir::stablehlo::deserializePortableArtifact, + // which runs VhloLegalizeToStablehloPass whose depends on dialects: + // Func, Stablehlo, Shape, and Quantization. + // + // If we do not register them here, an error will be + // triggered because we cannot load a dialect while in a + // multi-threaded execution context, and PassManager is + // multi-threaded. + let dependentDialects = [ + "chlo::ChloDialect", + "mhlo::MhloDialect", + "shape::ShapeDialect", + "stablehlo::StablehloDialect", + "vhlo::VhloDialect", + "quant::QuantizationDialect", + ]; + + let constructor = "TF::CreateXlaCallModuleDeserializationPass()"; +} + +def XlaCallModuleSerializationPass + : Pass<"tf-xla-call-module-serialization", "ModuleOp"> { + let summary = "Serializes StableHLO functions from top-level module into `tf.XlaCallModule`'s `module` attribute"; + + let description = [{ + This pass collects StableHLO functions referenced from `tf.XlaCallModule`'s + `_entry_function` attribute into a module, serializes the module into MLIR + bytecode, and embed the bytecode to `tf.XlaCallModule`'s `module` attribute. + + After serialization, this pass removes the `_entry_function` attribute from + `tf.XlaCallModule`, and removes all the serialized stablehlo functions + from the top-level module. 
+ }]; + + let dependentDialects = [ + "stablehlo::StablehloDialect", + "vhlo::VhloDialect", + ]; + + let constructor = "TF::CreateXlaCallModuleSerializationPass()"; +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc new file mode 100644 index 00000000000..cb7d8ea0c21 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.cc @@ -0,0 +1,142 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h" + +#include + +#include +#include +#include + +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/tsl/platform/path.h" + +namespace mlir { +namespace tf_saved_model { +namespace { + +#define GEN_PASS_DEF_ASSETSINKINGPASS +#define GEN_PASS_DECL_ASSETSINKINGPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_savedmodel_passes.h.inc" + +class AssetSinkingPass : public impl::AssetSinkingPassBase { + public: + AssetSinkingPass() = default; + + explicit AssetSinkingPass(llvm::StringRef saved_model_dir) { + saved_model_dir_ = saved_model_dir.str(); + } + + void runOnOperation() override { + mlir::ModuleOp module = getOperation(); + if (!mlir::tf_saved_model::HasTfSavedModelSemantics(module)) { + return; + } + + auto init_op = mlir::tf_saved_model::GetSessionInitializerOp(module); + if (init_op == nullptr || init_op.getInitializers().empty()) { + return; + } + + mlir::SymbolTable symbol_table(module); + for (auto initializer : 
init_op.getInitializers()) { + auto func = symbol_table.lookup( + initializer.cast().getValue()); + RewriteFunction(symbol_table, func); + } + + // Clean up unused asset ops. + for (auto asset : llvm::make_early_inc_range( + module.getOps())) { + if (symbol_table.symbolKnownUseEmpty(asset, module)) { + asset.erase(); + } + } + } + + private: + // Replaces bounded-input arguments of the function with constant ops in the + // body and removes the arguments. + void RewriteFunction(const mlir::SymbolTable& symbol_table, + mlir::func::FuncOp func) { + if (func.getNumArguments() == 0) { + return; + } + + auto builder = mlir::OpBuilder::atBlockBegin(&func.front()); + + llvm::SmallDenseMap const_ops; + llvm::BitVector arg_indexes_to_remove(func.getNumArguments()); + + // Replace arguments with const ops. + for (mlir::BlockArgument argument : func.getArguments()) { + auto asset = mlir::tf_saved_model::LookupBoundInputOfType< + mlir::tf_saved_model::AssetOp>(func, argument.getArgNumber(), + symbol_table); + if (asset == nullptr) { + continue; + } + + // Create a const op for the asset if it doesn't already exist. + auto it = const_ops.find(asset.getSymName()); + if (it == const_ops.end()) { + // Asset filenames are relative to the SavedModel directory. + const std::string filename = tsl::io::JoinPath( + saved_model_dir_, absl::string_view(asset.getFilename())); + + mlir::RankedTensorType type = mlir::RankedTensorType::get( + {}, mlir::TF::StringType::get(builder.getContext())); + auto const_op = builder.create( + builder.getUnknownLoc(), + mlir::DenseStringElementsAttr::get(type, {filename})); + + it = const_ops.insert({asset.getSymName(), const_op}).first; + } + + argument.replaceAllUsesWith(it->second.getOutput()); + arg_indexes_to_remove.set(argument.getArgNumber()); + } + + // Erase function arguments with bounded input.
+ func.eraseArguments(arg_indexes_to_remove); + } +}; + +} // namespace + +std::unique_ptr> CreateAssetSinkingPass( + llvm::StringRef saved_model_dir) { + return std::make_unique(saved_model_dir); +} + +} // namespace tf_saved_model +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h new file mode 100644 index 00000000000..a14e98e483f --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h @@ -0,0 +1,35 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_ASSET_SINKING_PASS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_ASSET_SINKING_PASS_H_ + +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace mlir { +namespace tf_saved_model { + +// Creates a pass that sinks SavedModel asset filenames to constants. 
+std::unique_ptr> CreateAssetSinkingPass( + llvm::StringRef saved_model_dir); + +} // namespace tf_saved_model +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_TRANSFORMS_TF_SAVED_MODEL_ASSET_SINKING_PASS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h index 3cdabf52246..801eaaeb0ae 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h @@ -21,6 +21,7 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h" #include "tensorflow/core/public/session.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_savedmodel_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_savedmodel_passes.td index 302cdf29bc3..2e190255080 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_savedmodel_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_savedmodel_passes.td @@ -162,3 +162,28 @@ def AddFunctionsForExportedNamesPass : Pass<"tf-saved-model-add-functions-for-ex }]; let constructor = "::mlir::tf_saved_model::CreateAddFunctionsForExportedNamesPass()"; } + +def AssetSinkingPass : Pass<"tf-saved-model-asset-sinking", "mlir::ModuleOp"> { + let summary = "Sinks SavedModel asset filenames to constants"; + + let description = [{ + This pass sinks arguments of SavedModel methods that are bounded to + `tf_saved_model.asset` into constants in the methods. After the pass, unused + asset ops are removed from the module. 
+ + This is to convert initialization methods with bound inputs into the same + methods without any arguments, so that program invocation doesn't need to + track and explicitly pass asset filenames. + + This pass accepts an option `saved-model-dir`, which specifies the directory + where SavedModel is stored. This is a required option because all asset + filenames are relative to this directory. + }]; + + let constructor = "::mlir::tf_saved_model::CreateAssetSinkingPass(\"\")"; + + let options = [ + Option<"saved_model_dir_", "saved-model-dir", "std::string", "", + "SavedModel directory, which is prepended to asset file names.">, + ]; +} diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc index bb2f8f26b65..bbbb92db49e 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_cluster_formation.cc @@ -17,7 +17,9 @@ limitations under the License. #include #include #include +#include #include +#include #include #include #include @@ -36,10 +38,12 @@ limitations under the License. #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -51,6 +55,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/string_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" #include "tensorflow/core/util/device_name_utils.h" @@ -134,6 +139,11 @@ LogicalResult CollectMetadata(Block* block, MetadataMap* metadata_map) { return success(); } +struct OpDevice { + Operation* op; + std::string device; +}; + // Collects and clusters ops either based on `_replication_info` attribute // (replicated case) or using one single cluster (non-replicated case). Also // sets `device_type` if there is any cluster (note that the device type must be @@ -147,7 +157,7 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, bool has_local_device_name_collisions = false; // Use ordered set here to make error message below deterministic. std::set device_types; - std::unordered_map devices; + absl::flat_hash_map devices; for (Operation& op : *block) { LogicalResult result = TF::HasValidCompilationAndReplicationAttributes(op); if (failed(result)) return result; @@ -188,10 +198,25 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, } auto device_attr = op.getAttrOfType(kDeviceAttr); std::string device_local_name; + bool is_tpu_device = false; if (device_attr && !device_attr.str().empty()) { + tensorflow::DeviceNameUtils::ParsedName parsed; + if (!tensorflow::DeviceNameUtils::ParseFullOrLocalName(device_attr.str(), + &parsed)) { + op.emitWarning() << "Invalid device name " << device_attr.str(); + return failure(); + } + device_local_name = - tensorflow::DeviceNameUtils::LocalName(device_attr.str()); + tensorflow::DeviceNameUtils::LocalName(parsed.type, parsed.id); + is_tpu_device = parsed.type == "TPU"; } + + // Ignore non-TPU devices when clustering. 
+ if (!is_tpu_device) { + continue; + } + if (!has_replicated_compiled_op && !device_local_name.empty()) { // It is possible that a device may be same Local Name but // different fullname. Devices with same Local name are identical @@ -200,24 +225,30 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, // information such as task, replica, job etc. An example fullname is // "/job:foo_bar/replica:1/task:2/device:GPU:3" if (devices.count(device_local_name)) { - std::string device1 = devices[device_local_name]; + std::string device1 = devices[device_local_name].device; std::string device2 = device_attr.str(); // Is either of the two devices just a substring of the other? If // not, we treat them as different devices, and we have a collision. if (device1.find(device2) == std::string::npos && device2.find(device1) == std::string::npos) { + Operation* previous_op = devices[device_local_name].op; has_local_device_name_collisions = true; - LOG(WARNING) << "found two devices with same local name " + + LOG(WARNING) << "Found two devices with same local name " << device_local_name << " but conflicting fullname: " << device1 << " and " - << device2; + << device2 << "."; + LOG(WARNING) << "Previous assignment came from op: " + << tensorflow::OpAsString(*previous_op) + << ". Current op is: " << tensorflow::OpAsString(op); } // Always keep the longer name. 
- if (devices[device_local_name].size() < device_attr.str().size()) { - devices[device_local_name] = device_attr.str(); + if (devices[device_local_name].device.size() < + device_attr.str().size()) { + devices[device_local_name] = {&op, device_attr.str()}; } } else { - devices.insert({device_local_name, device_attr.str()}); + devices.insert({device_local_name, {&op, device_attr.str()}}); } } } @@ -237,13 +268,14 @@ LogicalResult CollectAndGroupClusterOps(Block* block, ClusterMap* clusters, if (devices.size() > 1) { LOG(WARNING) << "found different devices for no replication: "; for (const auto& device_names : devices) { - LOG(WARNING) << device_names.first << ", " << device_names.second; + LOG(WARNING) << device_names.first << ", " + << device_names.second.device; } } else if (has_local_device_name_collisions) { LOG(WARNING) << "Not assigning device because of conflicting fullnames."; } else if (devices.size() == 1 && - absl::StrContains(devices.begin()->second, "TPU:")) { - device = devices.begin()->second; + absl::StrContains(devices.begin()->second.device, "TPU:")) { + device = devices.begin()->second.device; } } if (!clusters->empty()) { @@ -697,7 +729,7 @@ LogicalResult ReplicateCluster(tf_device::ClusterOp cluster, int num_replicas, } } - // Create `ordered_tpu_replicate_inputs` which constains the final ordered + // Create `ordered_tpu_replicate_inputs` which contains the final ordered // replicate inputs. All packed arguments are moved to the end of the arg // list. llvm::SmallVector ordered_tpu_replicate_inputs = diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc new file mode 100644 index 00000000000..8e9431f7391 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_deserialization.cc @@ -0,0 +1,280 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include +#include +#include + +#include "absl/strings/str_format.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" +#include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep +#include "tensorflow/tsl/platform/statusor.h" + 
+namespace mlir { +namespace TF { +namespace { + +#define GEN_PASS_DEF_XLACALLMODULEDESERIALIZATIONPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" + +// `tf.backend_config` is a DictionaryAttr, JAX2TF sets the value of its +// i64 attribute `called_index` to the TF function's name. +constexpr llvm::StringRef kTfBackendConfigAttrName = "tf.backend_config"; +constexpr llvm::StringRef kCalledIndexAttrName = "called_index"; +constexpr llvm::StringRef kCalledFuncAttrName = "called_func"; + +// The function name format for the deserialized stablehlo functions: +// _stablehlo_{original function name}_{index}. +constexpr const char *kNewFuncNameFormat = "_stablehlo_%s_%d"; + +// Deserialize the StableHLO module embedded in XlaCallModuleOp's module +// attribute. +tsl::StatusOr> DeserializeStablehlo(MLIRContext *context, + XlaCallModuleOp op) { + std::vector dim_args_spec; + for (auto attr : op.getDimArgsSpec().getAsRange()) { + dim_args_spec.push_back(attr.getValue().str()); + } + std::vector disabled_checks; + for (auto attr : op.getDisabledChecks().getAsRange()) { + disabled_checks.push_back(attr.getValue().str()); + } + std::vector platforms; + for (auto attr : op.getPlatforms().getAsRange()) { + platforms.push_back(attr.getValue().str()); + } + // XlaCallModuleOp OpKernel will determine platform index when running + // TF2XLA. We don't know the device/platform type in this MLIR pass, so + // we set loading_platform to the first platform. + std::string loading_platform = + (platforms.empty() ? "CPU" : platforms.front()); + TF_ASSIGN_OR_RETURN( + auto loader, + tensorflow::XlaCallModuleLoader::Create( + context, static_cast(op.getVersion()), op.getModule().str(), + std::move(dim_args_spec), std::move(disabled_checks), + std::move(platforms), std::move(loading_platform))); + return std::move(*loader).module(); +} + +// Returns a new function name in the kNewFuncNameFormat. +// The new name is unique in the symbol table. 
+std::string NewFuncName(const SymbolTable &symbol_table, + const llvm::StringRef func_name) { + uint64_t index = 0; + std::string new_func_name; + do { + new_func_name = absl::StrFormat(kNewFuncNameFormat, func_name, index++); + } while (symbol_table.lookup(new_func_name)); + return new_func_name; +} + +// Renames functions in the stablehlo module to avoid naming conflicts with +// existing functions in the tf module. +// Sets _from_xla_call_module attribute for each stablehlo function. +// Returns the new stablehlo main function's name or error. +// +// If we directly insert stablehlo functions into tf module, MLIR will rename +// the stablehlo functions themselves in the tf module automatically to avoid +// naming conflicts. But we need to rename the function calls inside the +// stablehlo functions as well. So we first do this renaming in the stablehlo +// module itself without inserting into the tf module. +FailureOr RenameStablehloFunctions( + MLIRContext *context, SymbolTableCollection &symbol_tables, + ModuleOp tf_module, ModuleOp stablehlo_module) { + SymbolTable &tf_sym_table = symbol_tables.getSymbolTable(tf_module); + SymbolTable &stablehlo_sym_table = + symbol_tables.getSymbolTable(stablehlo_module); + Builder builder(context); + StringAttr new_main_func_name; + for (auto func : stablehlo_module.getOps()) { + auto new_func_name = + builder.getStringAttr(NewFuncName(tf_sym_table, func.getSymName())); + if (func.getSymName() == kStablehloMainFunctionName) { + new_main_func_name = new_func_name; + } + if (failed(stablehlo_sym_table.replaceAllSymbolUses(func, new_func_name, + stablehlo_module))) { + return failure(); + } + func.setName(new_func_name); + func->setAttr(kFromXlaCallModuleAttrName, builder.getUnitAttr()); + } + return new_main_func_name; +} + +// Moves functions from one module to another. +// The moved functions are set to private. 
+void MoveFunctions(SymbolTableCollection &symbol_tables, ModuleOp from, + ModuleOp to) { + SymbolTable &to_sym_table = symbol_tables.getSymbolTable(to); + for (auto func : llvm::make_early_inc_range(from.getOps())) { + func->remove(); + func.setPrivate(); + to_sym_table.insert(func); + } +} + +void CopyStablehloModuleAttrs(ModuleOp stablehlo_module, XlaCallModuleOp op) { + op->setAttr(kStablehloModuleAttrsAttrName, + stablehlo_module->getAttrDictionary()); +} + +// Symbolizes `called_index` attributes in custom call ops to `called_func`. +LogicalResult SymbolizeCustomCallCalledIndex( + ModuleOp module, llvm::ArrayRef function_list) { + WalkResult result = + module.walk([&](stablehlo::CustomCallOp op) { + if (!IsTfFuncCustomCall(op)) { + return WalkResult::advance(); + } + + auto backend_config = + op->getAttrOfType(kTfBackendConfigAttrName); + if (!backend_config) { + op->emitOpError() + << "is missing attribute '" << kTfBackendConfigAttrName << "'"; + return WalkResult::interrupt(); + } + + auto called_index_attr = backend_config.get(kCalledIndexAttrName) + .dyn_cast_or_null(); + if (!called_index_attr) { + op->emitOpError() + << "is missing attribute '" << kCalledIndexAttrName << "'"; + return WalkResult::interrupt(); + } + int called_index = called_index_attr.getInt(); + if (called_index < 0 || called_index >= function_list.size()) { + op->emitOpError() + << "references function #" << called_index + << " but enclosing XlaCallModule has a function list of size " + << function_list.size(); + return WalkResult::interrupt(); + } + + llvm::SmallVector new_config; + // Copy the attributes in the current config except `called_index`. + for (auto attr : backend_config) { + if (attr.getName() != kCalledIndexAttrName) { + new_config.push_back(attr); + } + } + + Builder builder(op.getContext()); + // Sets the `called_index` attribute to the TF function's name.
+ new_config.push_back(builder.getNamedAttr(kCalledFuncAttrName, + function_list[called_index])); + + // Sets the `tf.backend_config` attribute to the `new_config`. + op->setAttr(kTfBackendConfigAttrName, + builder.getDictionaryAttr(new_config)); + + return WalkResult::advance(); + }); + return result.wasInterrupted() ? failure() : success(); +} + +LogicalResult DeserializeXlaCallModule(MLIRContext *context, + SymbolTableCollection &symbol_tables, + ModuleOp module, XlaCallModuleOp op) { + auto deserialized = DeserializeStablehlo(context, op); + if (!deserialized.ok()) { + return op.emitOpError() + << "failed to deserialize StableHLO module from XlaCallModule: " + << deserialized.status().ToString(); + } + OwningOpRef stablehlo_module = *std::move(deserialized); + + CopyStablehloModuleAttrs(*stablehlo_module, op); + + auto main_func = RenameStablehloFunctions(context, symbol_tables, module, + stablehlo_module.get()); + if (failed(main_func)) { + return failure(); + } + + MoveFunctions(symbol_tables, *stablehlo_module, module); + + // Translate `called_index` in TF function custom calls into symbol + // references. `function_list` attribute is no longer needed after that. + SmallVector function_list( + op.getFunctionList().getAsRange()); + if (failed(SymbolizeCustomCallCalledIndex(module, function_list))) { + return failure(); + } + op.removeFunctionListAttr(); + + // Module is deserialized, we set an empty string to it instead of removing + // it because it's a required attribute. + op.setModule(""); + // Set the stablehlo main function as a symbol attribute. + // This is required because we not only need this to look up the + // stablehlo function called by XlaCallModule, but also need the symbol + // reference to prevent DCE from removing the stablehlo functions from the + // top-level module.
+ op->setAttr(kStablehloEntryFunctionAttrName, + SymbolRefAttr::get(main_func.value())); + + return success(); +} + +class XlaCallModuleDeserializationPass + : public impl::XlaCallModuleDeserializationPassBase< + XlaCallModuleDeserializationPass> { + public: + void runOnOperation() override { + ModuleOp module = getOperation(); + SymbolTableCollection symbol_tables; + WalkResult result = module.walk([&](XlaCallModuleOp op) { + if (failed(DeserializeXlaCallModule(&getContext(), symbol_tables, module, + op))) { + return WalkResult::interrupt(); + } + return WalkResult::advance(); + }); + if (result.wasInterrupted()) { + return signalPassFailure(); + } + } +}; + +} // namespace + +std::unique_ptr> +CreateXlaCallModuleDeserializationPass() { + return std::make_unique(); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_serialization.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_serialization.cc new file mode 100644 index 00000000000..a75bf4c75d8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_call_module_serialization.cc @@ -0,0 +1,260 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "stablehlo/api/PortableApi.h" // from @stablehlo +#include "stablehlo/dialect/Serialization.h" // from @stablehlo +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/visitor.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" + +namespace mlir { +namespace TF { +namespace { + +#define GEN_PASS_DEF_XLACALLMODULESERIALIZATIONPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" // IWYU pragma: keep + +// `tf.backend_config` is a DictionaryAttr, JAX2TF sets the value of its +// i64 attribute `called_index` to the TF function's name. +constexpr llvm::StringRef kTfBackendConfigAttrName = "tf.backend_config"; +constexpr llvm::StringRef kCalledIndexAttrName = "called_index"; +constexpr llvm::StringRef kCalledFuncAttrName = "called_func"; + +// Converts `called_func` attributes in custom call ops back to `called_index`. 
+FailureOr DesymbolizeCustomCallCalledIndex(ModuleOp module) { + Builder builder(module.getContext()); + + SmallVector function_list; + llvm::DenseMap called_indexes; + + WalkResult result = module.walk([&](stablehlo::CustomCallOp op) { + if (!IsTfFuncCustomCall(op)) { + return WalkResult::advance(); + } + + auto backend_config = + op->getAttrOfType(kTfBackendConfigAttrName); + if (!backend_config) { + op->emitOpError() << "is missing attribute '" << kTfBackendConfigAttrName + << "'"; + return WalkResult::interrupt(); + } + auto called_func = backend_config.get(kCalledFuncAttrName) + .dyn_cast_or_null(); + if (!called_func) { + op->emitOpError() << "is missing attribute '" << kCalledFuncAttrName + << "'"; + return WalkResult::interrupt(); + } + + llvm::SmallVector new_config; + // Copy the attributes in the current config except `called_func`. + for (auto attr : backend_config) { + if (attr.getName() != kCalledFuncAttrName) { + new_config.push_back(attr); + } + } + + auto [it, inserted] = + called_indexes.insert({called_func, called_indexes.size()}); + if (inserted) { + function_list.push_back(called_func); + } + + // Set the `called_index` attribute to the TF function's name. + new_config.push_back(builder.getNamedAttr( + kCalledIndexAttrName, builder.getI64IntegerAttr(it->second))); + + // Set the `tf.backend_config` attribute to the `new_config`. + op->setAttr(kTfBackendConfigAttrName, + builder.getDictionaryAttr(new_config)); + + return WalkResult::advance(); + }); + if (result.wasInterrupted()) { + return failure(); + } + + return builder.getArrayAttr(function_list); +} + +// Creates a pruned module containing the XlaCallModule's entry function and +// other functions transitively called by the entry function. 
+FailureOr> PruneStablehloModule( + SymbolTableCollection& symbol_table, ModuleOp module, XlaCallModuleOp op) { + auto entry_func_symbol = + op->getAttrOfType(kStablehloEntryFunctionAttrName); + if (!entry_func_symbol) { + return op.emitOpError() << "does not have " + << kStablehloEntryFunctionAttrName << " attribute"; + } + auto entry_func = + symbol_table.lookupSymbolIn(module, entry_func_symbol); + if (!entry_func) { + return op.emitOpError() + << "references an unknown entry function " << entry_func_symbol; + } + + OpBuilder builder(module.getContext()); + + OwningOpRef stablehlo_module = + builder.create(op.getLoc()); + builder.setInsertionPointToEnd(stablehlo_module->getBody()); + + // Copy all referenced StableHLO functions to the new module. + WalkResult result = WalkReachableFunctions( + entry_func, + [&](func::FuncOp f) -> WalkResult { + if (!f->hasAttr(kFromXlaCallModuleAttrName)) { + return WalkResult::advance(); + } + + auto cloned = llvm::cast(builder.clone(*f)); + cloned->removeAttr(kFromXlaCallModuleAttrName); + + if (f == entry_func) { + // Entry function must be public and has symbol name "@main". + cloned.setPublic(); + cloned.setName(kStablehloMainFunctionName); + } else { + cloned.setPrivate(); + } + + return WalkResult::advance(); + }, + &symbol_table); + if (result.wasInterrupted()) { + return failure(); + } + + // Rewrite `custom_call`'s `called_func` attribute to `called_index`. + auto function_list = DesymbolizeCustomCallCalledIndex(*stablehlo_module); + if (failed(function_list)) return failure(); + op.setFunctionListAttr(*function_list); + + // Restore the deserialized stablehlo module's attributes to the reconstructed + // stablehlo module. The stablehlo module's attributes can contain important + // information such as SPMD num_replicas and num_partitions. 
+ auto original_stablehlo_module_attrs = + op->getAttrOfType(kStablehloModuleAttrsAttrName); + if (original_stablehlo_module_attrs) { + (*stablehlo_module)->setAttrs(original_stablehlo_module_attrs); + // Now, remove the attribute because later passes may not know how to handle + // it, we may encounter errors such as: + // "Unhandled attribute kind for attribute '_stablehlo_module_attrs'". + op->removeAttr(kStablehloModuleAttrsAttrName); + } + + return stablehlo_module; +} + +// Serializes the stablehlo module into bytecode. +FailureOr SerializeStablehlo(ModuleOp stablehlo_module) { + std::string bytecode; + llvm::raw_string_ostream os(bytecode); + if (mlir::failed(stablehlo::serializePortableArtifact( + stablehlo_module, stablehlo::getCurrentVersion(), os))) { + return stablehlo_module.emitError() + << "failed to serialize the pruned stablehlo module"; + } + return bytecode; +} + +// Serializes the stablehlo functions called by XlaCallModuleOp to bytecode +// and embeds the bytecode in XlaCallModuleOp's `module` attribute. +// +// The stablehlo functions include the function referred by XlaCallModuleOp's +// `_entry_function` attribute, and any stablehlo functions called transitively +// from the entry function. +LogicalResult SerializeXlaCallModule(SymbolTableCollection& symbol_table, + ModuleOp module, XlaCallModuleOp op) { + auto stablehlo_module = PruneStablehloModule(symbol_table, module, op); + if (failed(stablehlo_module)) { + return failure(); + } + + auto bytecode = SerializeStablehlo(**stablehlo_module); + if (failed(bytecode)) { + return failure(); + } + + op.setModule(*bytecode); + op->removeAttr(kStablehloEntryFunctionAttrName); + + return success(); +} + +// Removes the serialized stablehlo functions, because `XlaCallModuleOp` no +// longer has `_entry_function` attribute referencing the stablehlo main +// function, so all stablehlo functions are of no use in the top-level module. 
+// +// Walk the module to find functions with `_from_xla_call_module` attribute, +// and remove them. +void RemoveSerializedStablehloFunctions(ModuleOp module) { + module.walk([&](func::FuncOp f) { + if (f->hasAttr(kFromXlaCallModuleAttrName)) { + f->erase(); + } + }); +} + +class XlaCallModuleSerializationPass + : public impl::XlaCallModuleSerializationPassBase< + XlaCallModuleSerializationPass> { + public: + void runOnOperation() override { + mlir::ModuleOp module = getOperation(); + mlir::SymbolTableCollection symbol_table; + + mlir::WalkResult result = + module.walk([&](mlir::TF::XlaCallModuleOp xla_call_module) { + if (failed(SerializeXlaCallModule(symbol_table, module, + xla_call_module))) { + return mlir::WalkResult::interrupt(); + } + return mlir::WalkResult::advance(); + }); + if (result.wasInterrupted()) { + return signalPassFailure(); + } + + RemoveSerializedStablehloFunctions(module); + } +}; + +} // namespace + +std::unique_ptr> +CreateXlaCallModuleSerializationPass() { + return std::make_unique(); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc index f876231ab00..03e05816992 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_cluster_formation.cc @@ -20,11 +20,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" - -inline constexpr absl::string_view kEntryFunctionAttr = "tf.entry_function"; +#include "tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h" namespace mlir { @@ -63,37 +61,25 @@ void EncapsulatePartitionedCall(Operation *call_op) { } void XlaClusterFormationPass::runOnOperation() { + auto has_compile_device_type = [](SymbolUserOpInterface op) { + return op->hasAttr(tensorflow::kCompileDeviceTypeAttr); + }; + ModuleOp module = getOperation(); SymbolTable symtab(module); - - llvm::SmallVector entry_funcs; - // A model may have multiple graphs, with each graph having its own entry. - // When a graph is imported to MLIR, `tf.entry_function` will be added to - // each entry function. The one exception are initializer functions, which - // have `tf_saved_model.initializer_type` instead. 
- module.walk([&](func::FuncOp func) { - if (func->hasAttr(kEntryFunctionAttr) || - func->hasAttr(tf_saved_model::kTfSavedModelInitializerTypeAttr)) { - entry_funcs.push_back(func); - } - }); - if (entry_funcs.empty()) { - LOG(WARNING) << "no entry function is found"; - } - auto predicate = [](Operation *op) { - if (op->hasAttr(tensorflow::kCompileDeviceTypeAttr)) return true; - return false; - }; - for (auto &root : entry_funcs) { - llvm::SmallVector outermost_call_ops; - if (failed(GetOutermostOpsOfType( - root, symtab, outermost_call_ops, predicate))) + llvm::SmallVector entry_funcs = GetEntryFunctions(module); + for (auto &entry_func : entry_funcs) { + llvm::SmallVector outermost_pcall_ops; + if (failed(GetFirstOpsOfType( + entry_func, symtab, /*predicate*/ has_compile_device_type, + outermost_pcall_ops))) { return signalPassFailure(); + } // Cluster outermost partitioned calls with _xla_compile_device_type // attribute. - for (auto &call_op : outermost_call_ops) { - EncapsulatePartitionedCall(call_op); + for (auto &pcall_op : outermost_pcall_ops) { + EncapsulatePartitionedCall(pcall_op); } } } diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc index c59d6e532d0..1992f43a951 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_rewrite.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -// This transformation pass converts stateful and stateless paritioned calls +// This transformation pass converts stateful and stateless partitioned calls // with _xla_compile_device_type attribute to XLA launch ops. #include @@ -21,9 +21,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/call_graph_util.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h" #define DEBUG_TYPE "tf-xla-rewrite" @@ -52,7 +52,7 @@ void MoveResourceArgsToEnd(func::FuncOp callee) { removed_params.push_back(false); } } - // Remove old reousrce-type parameters. + // Remove old resource-type parameters. callee.getBody().front().eraseArguments(removed_params); // Update function type. callee.setFunctionType(FunctionType::get(callee.getContext(), @@ -98,20 +98,6 @@ void XlaRewritePass::runOnOperation() { module.walk([&](tf_device::ClusterFuncOp cluster_func_op) { RewriteCall(cluster_func_op, symtab, builder); }); - - // Verify that there are no nested XLA launch ops. - module.walk([&](TF::XlaLaunchOp xla_launch_op) { - llvm::SmallVector nested_launch_ops; - func::FuncOp root = symtab.lookup( - xla_launch_op.getFunctionAttr().getRootReference()); - if (failed(GetOutermostOpsOfType(root, symtab, - nested_launch_ops))) - return signalPassFailure(); - if (!nested_launch_ops.empty()) { - xla_launch_op.emitError() << "Nested XLA launch ops detected"; - return signalPassFailure(); - } - }); } } // namespace diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/xla_validate_inputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/xla_validate_inputs.cc new file mode 100644 index 00000000000..7891a672bdb --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/transforms/xla_validate_inputs.cc @@ -0,0 +1,102 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include +#include + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h" + +namespace mlir { + +namespace { + +#define GEN_PASS_DEF_XLAVALIDATEINPUTSPASS +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_device_passes.h.inc" + +// Validate input graph. 
+struct XlaValidateInputsPass + : public impl::XlaValidateInputsPassBase { + void runOnOperation() override; +}; + +LogicalResult has_nested_entry_functions( + const llvm::SmallVector &entry_funcs, SymbolTable &symtab) { + auto calls_entry_functions = [&](SymbolUserOpInterface op) { + llvm::SmallVector callees; + if (GetCallees(op, symtab, callees).failed()) { + return false; + } + for (auto &callee : callees) { + if (IsEntryFunction(callee)) { + return true; + } + } + return false; + }; + + for (auto &entry_func : entry_funcs) { + llvm::SmallVector calls; + if (GetFirstOpsOfType( + entry_func, symtab, /*predicate*/ calls_entry_functions, calls) + .failed()) { + return failure(); + } + if (!calls.empty()) { + // Some passes in MLIR GPU phase 1 pipeline uses entry functions as start + // point for tree traversal (input graphs are transformed to trees in + // GuaranteeAllFuncsOneUsePass). They will not work properly if there are + // nested calls of entry fucntions. We can add a pass after + // GuaranteeAllFuncsOneUsePass to remove "tf.entry_function" or + // "tf_saved_model.initializer_type" attribute from the callee of the + // inner calls + entry_func->emitError() + << "CPU/GPU MLIR phase 1 pipeline does not support nested calls of " + "entry functions. 
Remove tf.entry_function or " + "tf_saved_model.initializer_type from the called functions in the " + "inner calls after GuaranteeAllFuncsOneUsePass to add the support"; + return failure(); + } + } + return success(); +} + +void XlaValidateInputsPass::runOnOperation() { + ModuleOp module = getOperation(); + SymbolTable symtab(module); + llvm::SmallVector entry_funcs = GetEntryFunctions(module); + if (entry_funcs.empty()) { + LOG(WARNING) << "missing entry functions"; + } + + if (has_nested_entry_functions(entry_funcs, symtab).failed()) { + return signalPassFailure(); + } +} + +} // namespace + +namespace TFDevice { +std::unique_ptr> CreateXlaValidateInputsPass() { + return std::make_unique(); +} +} // namespace TFDevice + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc index 61490f6a749..74cf8423270 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/export_graphdef.cc @@ -557,7 +557,7 @@ StatusOr> Exporter::Convert( llvm::dyn_cast(inst)) { Operation& inner_op = island.GetBody().front(); auto op_name = GetTensorFlowOpName(inner_op.getName().getStringRef()); - if (op_name.ok()) { + if (llvm::isa(inner_op) && op_name.ok()) { // If it is TF Control dialect specific op, look up custom operation // in the module and first convert that, then add it to function // definition library diff --git a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index b8ba989b33b..8b23293dadc 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -1764,7 +1764,7 @@ mlir::Location ImporterBase::GetLocation(const Node& node) { // finally to just name. 
if (auto stack_trace = node.GetStackTrace()) { DVLOG(1) << "Stack available for " << node.name(); - absl::Span frames = stack_trace->ToFrames(); + std::vector frames = stack_trace->ToUncachedFrames(); locations.reserve(frames.size()); for (const StackFrame& frame : llvm::reverse(frames)) { auto file_name = mlir::StringAttr::get(context_, frame.file_name); @@ -1773,7 +1773,6 @@ mlir::Location ImporterBase::GetLocation(const Node& node) { mlir::FileLineColLoc::get(file_name, frame.line_number, 1); locations.push_back(file_line_loc); } - stack_trace->WipeCache(); } else { DVLOG(1) << "No stack trace for " << node.name(); const auto location_it = debug_info.find(debug_info_key); @@ -2486,6 +2485,8 @@ StatusOr> GraphDefImporter::Convert( b.getNamedAttr("_xla_compile_device_type", b.getStringAttr(specs.xla_compile_device_type))); } + attrs.push_back(b.getNamedAttr("allow_soft_placement", + b.getBoolAttr(specs.enable_soft_placement))); } else { // Collects the argument and return nodes by looking up the node names // specified by the user. diff --git a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h index 79d364bf6b2..191676999be 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h @@ -93,6 +93,9 @@ struct GraphImportConfig { // If set, use the value as the device type and mark the function graph for // XLA compilation. string xla_compile_device_type; + // If true, enables moving ops to different devices or moving unsupported ops + // out of a compilation cluster. 
+ bool enable_soft_placement = false; }; struct GraphExportConfig { diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc index 233d35d8c01..45bfe3e2e11 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" #include +#include +#include #include "absl/memory/memory.h" #include "llvm/Support/raw_ostream.h" @@ -45,16 +47,12 @@ limitations under the License. namespace tensorflow { static StatusOr> GraphdefToMlirImport( - llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, - const std::vector& input_arrays, + llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, const std::vector& output_arrays, const std::vector& control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, - bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context) { GraphDef graphdef; TF_RETURN_IF_ERROR( tensorflow::LoadProtoFromBuffer({input.data(), input.size()}, &graphdef)); @@ -62,19 +60,21 @@ static StatusOr> GraphdefToMlirImport( TF_RETURN_IF_ERROR(ByteSwapTensorContentInGraphDef(&graphdef)); GraphDebugInfo debug_info; - if (!debug_info_file.empty()) { - TF_RETURN_IF_ERROR(LoadProtoFromFile(debug_info_file, &debug_info)); + if (!import_options.debug_info_file.empty()) { + TF_RETURN_IF_ERROR( + LoadProtoFromFile(import_options.debug_info_file, &debug_info)); } GraphImportConfig specs; - specs.prune_unused_nodes = prune_unused_nodes; - specs.convert_legacy_fed_inputs = 
convert_legacy_fed_inputs; - specs.graph_as_function = graph_as_function; - specs.upgrade_legacy = upgrade_legacy; - specs.enable_shape_inference = enable_shape_inference; + specs.prune_unused_nodes = import_options.prune_unused_nodes; + specs.convert_legacy_fed_inputs = import_options.convert_legacy_fed_inputs; + specs.graph_as_function = import_options.graph_as_function; + specs.upgrade_legacy = import_options.upgrade_legacy; + specs.enable_shape_inference = import_options.enable_shape_inference; specs.unconditionally_use_set_output_shapes = - unconditionally_use_set_output_shapes; - specs.xla_compile_device_type = xla_compile_device_type; + import_options.unconditionally_use_set_output_shapes; + specs.xla_compile_device_type = import_options.xla_compile_device_type; + specs.enable_soft_placement = import_options.enable_soft_placement; TF_RETURN_IF_ERROR(ParseInputArrayInfo(input_arrays, input_dtypes, input_shapes, &specs.inputs)); TF_RETURN_IF_ERROR(ParseOutputArrayInfo(output_arrays, &specs.outputs)); @@ -109,22 +109,15 @@ static StatusOr> GraphdefToMlirImport( } StatusOr> GraphdefToMlirTranslateFunction( - llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, - const std::vector& input_arrays, + llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, const std::vector& output_arrays, const std::vector& control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, - bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( - input, debug_info_file, xla_compile_device_type, input_arrays, - input_dtypes, input_shapes, output_arrays, control_output_arrays, - prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, - 
upgrade_legacy, enable_shape_inference, - unconditionally_use_set_output_shapes, context); + input, input_arrays, input_dtypes, input_shapes, output_arrays, + control_output_arrays, import_options, context); if (!module_or.status().ok()) { LOG(ERROR) << "Graph import failed: " << module_or.status(); } @@ -132,13 +125,10 @@ StatusOr> GraphdefToMlirTranslateFunction( } StatusOr> GraphdefToMlirTranslateFunction( - llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, absl::string_view input_arrays, + llvm::StringRef input, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, - bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context) { std::vector input_array_vector; std::vector input_dtype_vector; std::vector>> input_shapes_vector; @@ -151,11 +141,9 @@ StatusOr> GraphdefToMlirTranslateFunction( TF_RETURN_IF_ERROR( ParseNodeNames(control_output_arrays, control_output_array_vector)); return GraphdefToMlirTranslateFunction( - input, debug_info_file, xla_compile_device_type, input_array_vector, - input_dtype_vector, input_shapes_vector, output_array_vector, - control_output_array_vector, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, - enable_shape_inference, unconditionally_use_set_output_shapes, context); + input, input_array_vector, input_dtype_vector, input_shapes_vector, + output_array_vector, control_output_array_vector, import_options, + context); } StatusOr> SavedModelObjectGraphToMlirImport( @@ -252,22 +240,15 @@ SavedModelSignatureDefsToMlirImportLite( StatusOr> GraphdefToSplattedMlirTranslateFunction( - llvm::StringRef input, 
absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, - const std::vector& input_arrays, + llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, const std::vector& output_arrays, const std::vector& control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, - bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context) { auto module_or = GraphdefToMlirImport( - input, debug_info_file, xla_compile_device_type, input_arrays, - input_dtypes, input_shapes, output_arrays, control_output_arrays, - prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, - upgrade_legacy, enable_shape_inference, - unconditionally_use_set_output_shapes, context); + input, input_arrays, input_dtypes, input_shapes, output_arrays, + control_output_arrays, import_options, context); if (!module_or.status().ok()) { LOG(ERROR) << "Graph import failed: " << module_or.status(); return module_or.status(); @@ -306,13 +287,10 @@ GraphdefToSplattedMlirTranslateFunction( StatusOr> GraphdefToSplattedMlirTranslateFunction( - llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, absl::string_view input_arrays, + llvm::StringRef input, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, bool enable_shape_inference, - bool unconditionally_use_set_output_shapes, mlir::MLIRContext* context) { + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context) { std::vector input_array_vector; std::vector input_dtype_vector; std::vector>> 
input_shapes_vector; @@ -325,11 +303,9 @@ GraphdefToSplattedMlirTranslateFunction( TF_RETURN_IF_ERROR( ParseNodeNames(control_output_arrays, control_output_array_vector)); return GraphdefToSplattedMlirTranslateFunction( - input, debug_info_file, xla_compile_device_type, input_array_vector, - input_dtype_vector, input_shapes_vector, output_array_vector, - control_output_array_vector, prune_unused_nodes, - convert_legacy_fed_inputs, graph_as_function, upgrade_legacy, - enable_shape_inference, unconditionally_use_set_output_shapes, context); + input, input_array_vector, input_dtype_vector, input_shapes_vector, + output_array_vector, control_output_array_vector, import_options, + context); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h index 677c09dd027..ff53e066964 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include "absl/base/macros.h" #include "absl/strings/string_view.h" @@ -34,24 +35,30 @@ namespace tensorflow { using tsl::Status; using tsl::StatusOr; +struct GraphdefToMlirOptions { + std::string debug_info_file; + std::string xla_compile_device_type; + bool prune_unused_nodes; + bool convert_legacy_fed_inputs; + bool graph_as_function; + bool upgrade_legacy; + bool enable_shape_inference; + bool unconditionally_use_set_output_shapes; + bool enable_soft_placement; +}; + // TODO(antiagainst): Directly manipulating files in library functions is not // a good idea. We should pass in a string/stream here. // Converts a TensorFlow GraphDef contained in `input` param into a MLIR module. // Creates MLIR entities into the given MLIR `context`. 
StatusOr> GraphdefToMlirTranslateFunction( - llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, - const std::vector& input_arrays, + llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>>& input_shapes, const std::vector& output_arrays, const std::vector& control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, - // TODO(jpienaar): Remove these. - bool enable_shape_inference, bool unconditionally_use_set_output_shapes, - mlir::MLIRContext* context); + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context); ABSL_DEPRECATED( "Please use the other overload of this function which accepts structured " @@ -59,32 +66,21 @@ ABSL_DEPRECATED( // Converts a TensorFlow GraphDef contained in `input` param into a MLIR module. // Creates MLIR entities into the given MLIR `context`. StatusOr> GraphdefToMlirTranslateFunction( - llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, absl::string_view input_arrays, + llvm::StringRef input, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, - // TODO(jpienaar): Remove these. - bool enable_shape_inference, bool unconditionally_use_set_output_shapes, - mlir::MLIRContext* context); + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context); // Similar as the above function, but replaces all constant tensors // with randomly generated splat values. 
StatusOr> GraphdefToSplattedMlirTranslateFunction( - llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, - const std::vector& input_arrays, + llvm::StringRef input, const std::vector& input_arrays, const std::vector& input_dtypes, const std::vector>& input_shapes, const std::vector& output_arrays, const std::vector& control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, - // TODO(jpienaar): Remove these. - bool enable_shape_inference, bool unconditionally_use_set_output_shapes, - mlir::MLIRContext* context); + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context); ABSL_DEPRECATED( "Please use the other overload of this function which accepts structured " @@ -93,15 +89,10 @@ ABSL_DEPRECATED( // with randomly generated splat values. StatusOr> GraphdefToSplattedMlirTranslateFunction( - llvm::StringRef input, absl::string_view debug_info_file, - absl::string_view xla_compile_device_type, absl::string_view input_arrays, + llvm::StringRef input, absl::string_view input_arrays, absl::string_view input_dtypes, absl::string_view input_shapes, absl::string_view output_arrays, absl::string_view control_output_arrays, - bool prune_unused_nodes, bool convert_legacy_fed_inputs, - bool graph_as_function, bool upgrade_legacy, - // TODO(jpienaar): Remove these. - bool enable_shape_inference, bool unconditionally_use_set_output_shapes, - mlir::MLIRContext* context); + const GraphdefToMlirOptions& import_options, mlir::MLIRContext* context); // Converts a TensorFlow SavedModel stored in the directory with the given // `saved_model_dir` into a MLIR module. 
Creates MLIR entities into the diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc index d739b3997c5..ac1a6fe6881 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.cc @@ -130,6 +130,12 @@ opt unconditionally_use_set_output_shapes( "(temporary)"), llvm::cl::init(false)); +// NOLINTNEXTLINE +opt enable_soft_placement( + "tf-enable-soft-placement-on-import", + llvm::cl::desc("Enable soft device placement on import."), + llvm::cl::init(false)); + // Export options. // NOLINTNEXTLINE opt export_entry_func_to_flib( diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h index af50bdc185f..ebf5dc0b0a7 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h @@ -44,6 +44,7 @@ extern llvm::cl::opt upgrade_legacy; // TODO(jpienaar): Temporary flag, flip default and remove. extern llvm::cl::opt enable_shape_inference; extern llvm::cl::opt unconditionally_use_set_output_shapes; +extern llvm::cl::opt enable_soft_placement; // Export options. 
extern llvm::cl::opt export_entry_func_to_flib; diff --git a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc index 6ce04664a7b..4aa10153e79 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_registration.cc @@ -44,12 +44,16 @@ inline absl::string_view StringRefToView(llvm::StringRef ref) { static OwningOpRef GraphdefToMlirTranslateFunction( llvm::StringRef input, MLIRContext* context) { + tensorflow::GraphdefToMlirOptions options{ + debug_info_file, xla_compile_device_type, + prune_unused_nodes, convert_legacy_fed_inputs, + graph_as_function, upgrade_legacy, + enable_shape_inference, unconditionally_use_set_output_shapes, + enable_soft_placement}; + auto module_or = tensorflow::GraphdefToMlirTranslateFunction( - input, debug_info_file, xla_compile_device_type, input_arrays, - input_dtypes, input_shapes, output_arrays, control_output_arrays, - prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, - upgrade_legacy, enable_shape_inference, - unconditionally_use_set_output_shapes, context); + input, input_arrays, input_dtypes, input_shapes, output_arrays, + control_output_arrays, options, context); if (!module_or.status().ok()) return nullptr; return std::move(module_or).value(); } @@ -59,12 +63,14 @@ static TranslateToMLIRRegistration GraphdefToMlirTranslate( static OwningOpRef GraphdefToSplattedMlirTranslateFunction( llvm::StringRef input, MLIRContext* context) { + tensorflow::GraphdefToMlirOptions options{ + debug_info_file, xla_compile_device_type, + prune_unused_nodes, convert_legacy_fed_inputs, + graph_as_function, upgrade_legacy, + enable_shape_inference, unconditionally_use_set_output_shapes}; auto module_or = tensorflow::GraphdefToSplattedMlirTranslateFunction( - input, debug_info_file, xla_compile_device_type, input_arrays, - 
input_dtypes, input_shapes, output_arrays, control_output_arrays, - prune_unused_nodes, convert_legacy_fed_inputs, graph_as_function, - upgrade_legacy, enable_shape_inference, - unconditionally_use_set_output_shapes, context); + input, input_arrays, input_dtypes, input_shapes, output_arrays, + control_output_arrays, options, context); if (!module_or.status().ok()) return nullptr; return std::move(module_or).value(); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc index ed84b747fc6..d11371e395f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.cc @@ -16,6 +16,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" #include +#include +#include +#include #include "absl/strings/str_split.h" #include "llvm/ADT/StringRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h index cf972183f4e..485ac2f7293 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_BRIDGE_LOGGER_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_BRIDGE_LOGGER_H_ +#include +#include + #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger_test.cc index 4a15dace1c8..b2d2d71128a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/bridge_logger_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" +#include + #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.cc new file mode 100644 index 00000000000..c1e9c9ad24b --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.cc @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" + +inline constexpr absl::string_view kEntryFunctionAttr = "tf.entry_function"; + +namespace mlir { + +bool IsEntryFunction(func::FuncOp func) { + return func->hasAttr(kEntryFunctionAttr) || + func->hasAttr(tf_saved_model::kTfSavedModelInitializerTypeAttr); +} + +llvm::SmallVector GetEntryFunctions(ModuleOp module) { + llvm::SmallVector entry_funcs; + module.walk([&](func::FuncOp func) { + // A model may have multiple graphs, with each graph having its own entry. 
+ // When a graph is imported to MLIR, `tf.entry_function` will be added to + // each entry function. The one exception are initializer functions, which + // have `tf_saved_model.initializer_type` instead. + if (IsEntryFunction(func)) { + entry_funcs.push_back(func); + } + }); + return entry_funcs; +} + +LogicalResult GetCallees(SymbolUserOpInterface op, SymbolTable &symtab, + llvm::SmallVector &callees) { + for (auto attr : op->getAttrs()) { + auto sym = attr.getValue().dyn_cast(); + if (!sym) continue; + auto callee = symtab.lookup(sym.getRootReference()); + if (!callee) { + // This is not expected to happen in practice. + return op->emitError() + << "Cannot find function " << sym.getRootReference(); + } + callees.push_back(callee); + } + return success(); +} + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h b/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h new file mode 100644 index 00000000000..8a45d6e79c8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h @@ -0,0 +1,73 @@ +/* Copyright 2022 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CALL_GRAPH_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CALL_GRAPH_UTIL_H_ + +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { + +// Check if a function is an entry in an MLIR module. +bool IsEntryFunction(func::FuncOp func); + +// Get all the entry functions in an MLIR module. +llvm::SmallVector GetEntryFunctions(ModuleOp module); + +// Get all the functions referenced in a symber user op and save them in +// `callees`. +LogicalResult GetCallees(SymbolUserOpInterface op, SymbolTable &symtab, + llvm::SmallVector &callees); + +// Find the first op with any of the specified types on the paths rooted at the +// `root` node in a tree. Additional filters can be applied via `predicate`. The +// results are stored in `ops`. 
+template +LogicalResult GetFirstOpsOfType( + func::FuncOp root, SymbolTable &symtab, + const std::function &predicate, + llvm::SmallVector &ops) { + std::stack worklist; + worklist.push(root); + while (!worklist.empty()) { + func::FuncOp u = worklist.top(); + worklist.pop(); + auto result = u.walk([&](SymbolUserOpInterface op) { + if (llvm::isa(op) && (!predicate || predicate(op))) { + ops.push_back(op); + return WalkResult::advance(); + } + llvm::SmallVector callees; + if (GetCallees(op, symtab, callees).failed()) { + return WalkResult::interrupt(); + } + for (auto callee : callees) { + worklist.push(callee); + } + return WalkResult::advance(); + }); + if (result.wasInterrupted()) return failure(); + } + return success(); +} + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_CALL_GRAPH_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util_test.cc new file mode 100644 index 00000000000..54f30fbe3b4 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/call_graph_util_test.cc @@ -0,0 +1,156 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/call_graph_util.h" + +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/attribute_utils.h" +#include "tensorflow/core/lib/core/status_test_util.h" +#include "tensorflow/core/platform/test.h" + +namespace tensorflow { +namespace { + +TEST(CallGraphUtilTest, GetEntryFunctions) { + const char *const code = R"mlir( +func.func @entry_func_1(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} { + %0 = "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @func} : (tensor) -> (tensor) + func.return %0 : tensor +} + +func.func @entry_func_2(%arg0: tensor) -> tensor attributes {tf_saved_model.initializer_type = ""} { + %0 = "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @func} : (tensor) -> (tensor) + func.return %0 : tensor +} + +func.func @func(%arg0: tensor) -> tensor { + func.return %arg0 : tensor +} +)mlir"; + mlir::MLIRContext context; + context.loadDialect(); + mlir::OwningOpRef module = + mlir::parseSourceString(code, &context); + ASSERT_TRUE(module); + auto entry_funcs = GetEntryFunctions(*module); + EXPECT_EQ(entry_funcs.size(), 2); + EXPECT_EQ(entry_funcs[0].getSymName(), "entry_func_1"); + 
EXPECT_EQ(entry_funcs[1].getSymName(), "entry_func_2"); +} + +TEST(CallGraphUtilTest, GetCallees) { + const char *const code = R"mlir( +func.func @entry_func(%arg0: tensor) -> tensor attributes {tf_saved_model.initializer_type = ""} { + %0 = "tf.While"(%arg0) {cond = @while_cond_func, body = @while_body_func, is_stateless = true} : (tensor) -> (tensor) + func.return %0 : tensor +} + +func.func @while_cond_func(%arg0: tensor) -> tensor { + %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + func.return %0 : tensor +} + +func.func @while_body_func(%arg0: tensor) -> (tensor) { + %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + func.return %0 : tensor +} + + +)mlir"; + mlir::MLIRContext context; + context.loadDialect(); + mlir::OwningOpRef module = + mlir::parseSourceString(code, &context); + ASSERT_TRUE(module); + mlir::SymbolTable symtab(*module); + llvm::SmallVector callees; + module->walk([&](mlir::SymbolUserOpInterface op) { + auto result = GetCallees(op, symtab, callees).succeeded(); + ASSERT_TRUE(result); + EXPECT_EQ(callees.size(), 2); + EXPECT_EQ(callees[0].getSymName(), "while_body_func"); + EXPECT_EQ(callees[1].getSymName(), "while_cond_func"); + }); +} + +TEST(CallGraphUtilTest, GetFirstOpsOfType) { + const char *const code = R"mlir( +func.func @entry_func(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} { + %0 = "tf.While"(%arg0) {cond = @while_cond_func, body = @while_body_func, is_stateless = true} : (tensor) -> (tensor) + func.return %0 : tensor +} + +func.func @while_cond_func(%arg0: tensor) -> tensor { + %0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + func.return %0 : tensor +} + +// CHECK-LABEL: func.func @while_body_func +func.func @while_body_func(%arg0: tensor) -> (tensor) { + %0 = "tf.StatefulPartitionedCall"(%arg0) {config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @outer_stateful_pcall_func} : (tensor) -> (tensor) + func.return %0 : tensor +} + +func.func 
@outer_stateful_pcall_func(%arg0: tensor) -> (tensor) { + %0 = "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @inner_stateful_pcall_func} : (tensor) -> (tensor) + func.return %0 : tensor +} + +func.func @inner_stateful_pcall_func(%arg0: tensor) -> tensor { + %0 = "tf.StatefulPartitionedCall"(%arg0) {_xla_compile_device_type = "CPU", config = "", config_proto = "", device = "/device:CPU:0", executor_type = "", f = @func} : (tensor) -> (tensor) + func.return %0 : tensor +} + +func.func @func(%arg0: tensor) -> tensor { + func.return %arg0 : tensor +} +)mlir"; + auto has_compile_device_type = [](mlir::SymbolUserOpInterface op) { + return op->hasAttr(tensorflow::kCompileDeviceTypeAttr); + }; + mlir::MLIRContext context; + context.loadDialect(); + mlir::OwningOpRef module = + mlir::parseSourceString(code, &context); + ASSERT_TRUE(module); + mlir::SymbolTable symtab(*module); + llvm::SmallVector entry_funcs = + GetEntryFunctions(*module); + EXPECT_EQ(entry_funcs.size(), 1); + EXPECT_EQ(entry_funcs[0].getSymName(), "entry_func"); + llvm::SmallVector outermost_pcall_ops; + auto result = + mlir::GetFirstOpsOfType( + entry_funcs[0], symtab, has_compile_device_type, outermost_pcall_ops) + .succeeded(); + ASSERT_TRUE(result); + EXPECT_EQ(outermost_pcall_ops.size(), 1); + auto func = + llvm::dyn_cast(outermost_pcall_ops[0]->getParentOp()); + ASSERT_TRUE(func); + EXPECT_EQ(func.getSymName(), "outer_stateful_pcall_func"); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/cluster_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/cluster_util_test.cc index f855b7f2c19..df641fb2176 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/cluster_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/cluster_util_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/cluster_util.h" +#include + #include #include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc index fce8c6f8dcf..4100ce55cf3 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.cc @@ -18,6 +18,8 @@ limitations under the License. #include #include #include +#include +#include #include "absl/base/casts.h" #include "absl/container/inlined_vector.h" @@ -31,6 +33,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/DebugStringHelper.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" @@ -38,6 +41,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" +#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/framework/tensor_util.h" #include "tensorflow/core/framework/types.pb.h" @@ -268,6 +272,15 @@ mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type) { return mlir::TF::ShapeAttr::get(type.getContext(), ArrayRef()); } +StatusOr ConvertTypeToTensorSpecProto(const mlir::Type& type) { + DataType dtype; + TF_RETURN_IF_ERROR(ConvertToDataType(type, &dtype)); + TensorSpecProto tensor_spec; + tensor_spec.set_dtype(dtype); + *tensor_spec.mutable_shape() = ConvertTypeToTensorShape(type).AsProto(); + return tensor_spec; +} + // Converts the tensor shape proto into an MLIR shape attribute. StatusOr ConvertTensorShapeProto(const TensorShapeProto& shape, mlir::MLIRContext* context) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h index 9255667c647..227e4bf465f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor.h @@ -24,6 +24,7 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" +#include "tensorflow/core/protobuf/struct.pb.h" namespace tensorflow { @@ -47,6 +48,10 @@ PartialTensorShape ConvertTypeToTensorShape(const mlir::Type& type); // Converts an MLIR shaped type to a TensorFlow shape attribute. mlir::TF::ShapeAttr ConvertTypeToTensorShapeAttr(const mlir::Type& type); +// Converts an MLIR shaped type to a Tensorflow tensor spec proto. 
+absl::StatusOr ConvertTypeToTensorSpecProto( + const mlir::Type& type); + // Converts a TensorFlow shape attribute to an MLIR shape attribute. StatusOr ConvertTensorShapeProto(const TensorShapeProto& shape, mlir::MLIRContext* context); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc index 373e88f7413..f5e58f28689 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_tensor_test.cc @@ -222,5 +222,41 @@ TEST(ConvertTensorProtoTest, NonSplatTensor) { ResultOf(IsSplat, IsFalse()))); } +TEST(ConvertTypeToTensorSpecProtoTest, UnrankedTensorType) { + mlir::MLIRContext context; + mlir::Builder b(&context); + + auto output_proto = ConvertTypeToTensorSpecProto( + mlir::UnrankedTensorType::get(b.getF32Type())); + TF_ASSERT_OK(output_proto.status()); + EXPECT_EQ(output_proto->dtype(), DT_FLOAT); + EXPECT_TRUE(output_proto->shape().unknown_rank()); +} + +TEST(ConvertTypeToTensorSpecProtoTest, RankedTensorType) { + mlir::MLIRContext context; + mlir::Builder b(&context); + + auto output_proto = ConvertTypeToTensorSpecProto( + mlir::RankedTensorType::get({1, 2, 3}, b.getF32Type())); + TF_ASSERT_OK(output_proto.status()); + EXPECT_EQ(output_proto->dtype(), DT_FLOAT); + EXPECT_EQ(output_proto->shape().dim_size(), 3); + EXPECT_EQ(output_proto->shape().dim().at(0).size(), 1); + EXPECT_EQ(output_proto->shape().dim().at(1).size(), 2); + EXPECT_EQ(output_proto->shape().dim().at(2).size(), 3); +} + +TEST(ConvertTypeToTensorSpecProtoTest, ScalarTensorType) { + mlir::MLIRContext context; + mlir::Builder b(&context); + + auto output_proto = ConvertTypeToTensorSpecProto(b.getF32Type()); + TF_ASSERT_OK(output_proto.status()); + EXPECT_EQ(output_proto->dtype(), DT_FLOAT); + EXPECT_FALSE(output_proto->shape().unknown_rank()); + EXPECT_EQ(output_proto->shape().dim_size(), 0); +} + } // namespace } // namespace tensorflow 
diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc index 2546fa44a05..45459e31f3f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include + #include "absl/strings/str_cat.h" #include "llvm/Support/Casting.h" #include "mlir/IR/BuiltinTypes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/convert_type_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/convert_type_test.cc index 7bc65919030..b844966c7ee 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/convert_type_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/convert_type_test.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include +#include + #include "llvm/Support/raw_ostream.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc index 92ce8886f8a..326dbbb4781 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/device_util_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include +#include #include #include diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc index fd100546555..9f3e0113339 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_graph.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include #include #include +#include #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc index efcbca84872..f07af4f8b85 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" @@ -87,7 +88,19 @@ struct WritableFileRawStream : public llvm::raw_ostream { SetUnbuffered(); } ~WritableFileRawStream() override = default; - uint64_t current_pos() const override { return 0; } + + uint64_t current_pos() const override { + int64_t position; + if (file->Tell(&position).ok()) { + return position; + } else { + // MLIR uses os.tell() to determine whether something was written by + // a subroutine or not, so it's important we have a working current_pos(). + LOG(WARNING) + << "Couldn't query file position. Stream might be malformed.\n"; + return -1; + } + } void write_impl(const char* ptr, size_t size) override { // Write the file if it is still valid. If the write fails, null out the @@ -154,7 +167,8 @@ Status CreateFileForDumping(llvm::StringRef name, if (dir == kCrashReproducerStdErr) { *os = std::make_unique(); - *filepath = "(stderr)"; + *filepath = + llvm::formatv("(stderr; requested filename: '{0}')", name).str(); return Status(); } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h index 6069b8ca2ad..a7760872d79 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h @@ -16,6 +16,7 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_MLIR_UTIL_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_DUMP_MLIR_UTIL_H_ +#include #include #include "absl/strings/string_view.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc index 908bf40f834..bb474b1413f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" +#include + #include #include #include "llvm/Support/MemoryBuffer.h" @@ -61,7 +63,7 @@ TEST(DumpMlirModuleTest, LogInfo) { setenv("TF_DUMP_GRAPH_PREFIX", "-", 1); std::string filepath = DumpMlirOpToFile("module", module_ref.get()); - EXPECT_EQ(filepath, "(stderr)"); + EXPECT_EQ(filepath, "(stderr; requested filename: 'module')"); } TEST(DumpMlirModuleTest, Valid) { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc index 42bdbf19d2a..6a66067920f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/error_util.cc @@ -47,15 +47,13 @@ StatusScopedDiagnosticHandler::StatusScopedDiagnosticHandler( } Status StatusScopedDiagnosticHandler::ConsumeStatus() { - return tensorflow::FromAbslStatus( - BaseScopedDiagnosticHandler::ConsumeStatus()); + return BaseScopedDiagnosticHandler::ConsumeStatus(); } Status StatusScopedDiagnosticHandler::Combine(Status status) { - absl::Status absl_s = - BaseScopedDiagnosticHandler::Combine(tensorflow::ToAbslStatus(status)); + absl::Status absl_s = BaseScopedDiagnosticHandler::Combine(status); - return tensorflow::FromAbslStatus(absl_s); + return absl_s; } } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc 
b/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc index 925c2dfc57b..260caf3494b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/eval_util.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/eval_util.h" +#include +#include + #include "absl/container/inlined_vector.h" #include "absl/strings/string_view.h" #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc index 5f1c2735972..b51856bc478 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.cc @@ -15,6 +15,11 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/export_utils.h" +#include +#include +#include +#include +#include #include #include diff --git a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h index 24152cad81c..86ff64b5ed4 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/export_utils.h @@ -57,7 +57,7 @@ StatusOr> GetOperationNodeDef( // "name" and "device" attributes are ignored by default. Use attrs_to_ignore to // specify any other attributes that should be ignored. Status ConvertAttributes( - const llvm::ArrayRef attrs, + llvm::ArrayRef attrs, const absl::flat_hash_set& attrs_to_ignore, bool remove_ref_type, AttrValueMap* values); diff --git a/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc b/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc index ecbc6e12fa1..7d7c2a3c074 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/fake_session.cc @@ -14,6 +14,11 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/tensorflow/utils/fake_session.h" +#include +#include +#include +#include + #include "absl/strings/match.h" #include "llvm/Support/CommandLine.h" #include "tensorflow/core/common_runtime/device_mgr.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/fake_session.h b/tensorflow/compiler/mlir/tensorflow/utils/fake_session.h index 83d499e0361..213cf4e66e1 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/fake_session.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/fake_session.h @@ -15,6 +15,11 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_FAKE_SESSION_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_FAKE_SESSION_H_ +#include +#include +#include +#include + #include "tensorflow/core/common_runtime/device_mgr.h" #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/platform/errors.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc index b47da952929..7b3312a76a6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/import_utils.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/tensorflow/utils/import_utils.h" +#include + #include "llvm/Support/FileUtilities.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc index d29343d83e4..ffd41db7f47 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/mangling_util.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" +#include #include #include "absl/strings/match.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.cc b/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.cc index 550b9c87b77..477d2948d25 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/parse_text_proto.h" +#include + #include "absl/strings/match.h" #include "absl/strings/string_view.h" #include "tensorflow/core/lib/core/errors.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc index db365a0c910..3709f88c4d7 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" +#include +#include + #include "llvm/Support/raw_ostream.h" #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc b/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc index fdb1ebc39a9..2895ebdc9c6 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/session_utils.cc @@ -14,6 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/tensorflow/utils/session_utils.h" +#include +#include + #include "absl/status/status.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/StringRef.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/session_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/session_utils.h index 3d009cbda37..be2d3786cb7 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/session_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/session_utils.h @@ -15,6 +15,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SESSION_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SESSION_UTILS_H_ +#include +#include + #include "absl/status/statusor.h" #include "llvm/ADT/ArrayRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h index d672c967060..040429ccf73 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/shape_inference_utils.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_SHAPE_INFERENCE_UTILS_H_ +#include + #include "tensorflow/core/ir/utils/shape_inference_utils.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.cc b/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.cc new file mode 100644 index 00000000000..549b665f044 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.cc @@ -0,0 +1,76 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h" + +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TF { +namespace { + +// jax2tf sets `stablehlo.custom_call`'s target name as `tf.call_tf_function` +// to represent calling a TF host callback function. +constexpr llvm::StringRef kTfTargetName = "tf.call_tf_function"; + +// `tf.backend_config` is a DictionaryAttr, JAX2TF sets the value of its +// string attribute `caller_name` to the TF host callback function's name. 
+constexpr llvm::StringRef kTfBackendConfigAttrName = "tf.backend_config"; +constexpr llvm::StringRef kCalledFuncAttrName = "called_func"; + +} // namespace + +bool IsTfFuncCustomCall(stablehlo::CustomCallOp op) { + return op.getCallTargetName() == kTfTargetName; +} + +DictionaryAttr GetTfBackendConfig(stablehlo::CustomCallOp op) { + return op->getAttrOfType(kTfBackendConfigAttrName); +} + +FailureOr GetTfFuncCustomCallFuncName( + stablehlo::CustomCallOp op) { + if (!IsTfFuncCustomCall(op)) { + return success(nullptr); + } + + auto config = GetTfBackendConfig(op); + if (config == nullptr) { + op.emitOpError() << "does not have dictionary attribute '" + << kTfBackendConfigAttrName << "'"; + return failure(); + } + + auto f = config.get(kCalledFuncAttrName); + if (f == nullptr) { + op.emitOpError() << "does not have attribute '" << kCalledFuncAttrName + << "' in its dictionary attribute '" + << kTfBackendConfigAttrName << "'"; + return failure(); + } + + if (auto attr = f.dyn_cast()) { + return attr; + } + + op.emitOpError() << "'s attribute '" << kCalledFuncAttrName + << "' is neither StringAttr nor FlatSymbolRefAttr"; + return failure(); +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h b/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h new file mode 100644 index 00000000000..7bb38112f77 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/stablehlo_custom_call.h @@ -0,0 +1,38 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STABLEHLO_CUSTOM_CALL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STABLEHLO_CUSTOM_CALL_H_ + +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo + +namespace mlir { +namespace TF { + +// Returns whether the custom call op represents a TF function call. +bool IsTfFuncCustomCall(stablehlo::CustomCallOp op); + +// Returns the `called_func` symbol ref attribute in the `tf.backend_config` +// dictionary attribute. +// +// If the op does not represent a TF function call, returns nullptr. +// Otherwise, if the op does not have `caller_name`, returns failure. +FailureOr GetTfFuncCustomCallFuncName( + stablehlo::CustomCallOp op); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STABLEHLO_CUSTOM_CALL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/string_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/string_util.cc new file mode 100644 index 00000000000..7fd832e7604 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/string_util.cc @@ -0,0 +1,57 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tensorflow/utils/string_util.h" + +#include +#include + +#include "llvm/Support/raw_ostream.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace tensorflow { + +// Return a string form of `op` including debug information. +std::string OpAsString(mlir::Operation& op) { + std::string out; + llvm::raw_string_ostream op_stream(out); + op.print(op_stream, mlir::OpPrintingFlags() + .elideLargeElementsAttrs() + .assumeVerified() + .skipRegions() + .printGenericOpForm()); + return out; +} + +std::string AttrAsString(mlir::Attribute& attr) { + std::string out; + llvm::raw_string_ostream attr_stream(out); + attr.print(attr_stream); + return out; +} + +std::ostream& operator<<(std::ostream& o, const LoggableOperation& op) { + return o << OpAsString(op.v); +} + +std::ostream& operator<<(std::ostream& o, const LoggableAttribute& attr) { + return o << AttrAsString(attr.v); +} + +std::ostream& operator<<(std::ostream& o, const LoggableStringRef& ref) { + return o << ref.v.str(); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tensorflow/utils/string_util.h b/tensorflow/compiler/mlir/tensorflow/utils/string_util.h new file mode 100644 index 00000000000..56410385c20 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/string_util.h @@ -0,0 +1,60 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STRING_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STRING_UTIL_H_ + +#include +#include + +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project + +// Utility functions for dumping operations/attributes as strings and ostream +// bindings. + +namespace tensorflow { +std::string OpAsString(mlir::Operation& op); +std::string AttrAsString(mlir::Attribute& attr); + +// b/281863212 enable automatic without Op/AttrAsString. +// We add logging via a wrapper struct in order to respect ODS and avoid +// multiple symbol definitions if MLIR or someone else decides to add ostream +// definitions for the MLIR symbols. +struct LoggableOperation { + mlir::Operation& v; + // NOLINTNEXTLINE(google-explicit-constructor) + LoggableOperation(mlir::Operation& v) : v(v) {} +}; +std::ostream& operator<<(std::ostream& o, const LoggableOperation& op); + +struct LoggableAttribute { + mlir::Attribute& v; + // NOLINTNEXTLINE(google-explicit-constructor) + LoggableAttribute(mlir::Attribute& v) : v(v) {} +}; +std::ostream& operator<<(std::ostream& o, const LoggableAttribute& attr); + +struct LoggableStringRef { + const llvm::StringRef& v; + // NOLINTNEXTLINE(google-explicit-constructor) + LoggableStringRef(const llvm::StringRef& v) : v(v) {} +}; +std::ostream& operator<<(std::ostream& o, const LoggableStringRef& ref); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_STRING_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc index 69671753ca9..2853816dd87 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tf_xla_mlir_translate.cc @@ -13,9 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include #include +#include #include #include "absl/strings/str_join.h" @@ -89,7 +91,7 @@ mlir::LogicalResult PrintHloModuleText( compilation_result.computation->proto(), module_config); if (!status_or_hlo_module.ok()) { LOG(ERROR) << "Conversion to HLO module failed: " - << status_or_hlo_module.status().ToString(); + << status_or_hlo_module.status(); return mlir::failure(); } @@ -315,7 +317,7 @@ static mlir::LogicalResult MlirTfToHloTextTranslateFunctionImpl( auto args_status = ParseArgumentShapes(mlir::StringRefToView(input_shapes), arg_shapes); if (!args_status.ok()) { - LOG(ERROR) << args_status.ToString(); + LOG(ERROR) << args_status; return mlir::failure(); } @@ -334,8 +336,7 @@ static mlir::LogicalResult MlirTfToHloTextTranslateFunctionImpl( /*shape_determination_fns=*/{}, &compilation_result, custom_legalization_passes); if (!compilation_status.ok()) { - LOG(ERROR) << "TF/XLA compilation failed: " - << compilation_status.ToString(); + LOG(ERROR) << "TF/XLA compilation failed: " << compilation_status; return mlir::failure(); } @@ -351,7 +352,7 @@ static mlir::LogicalResult MlirTfGraphToHloTextTranslateFunction( mlir::StringRefToView(input_shapes), mlir::StringRefToView(input_dtypes), mlir::StringRefToView(input_types), xla_arguments); if (!args_status.ok()) { - LOG(ERROR) << args_status.ToString(); + LOG(ERROR) << args_status; return mlir::failure(); } @@ -363,8 +364,7 @@ static mlir::LogicalResult MlirTfGraphToHloTextTranslateFunction( /*shape_determination_fns=*/{}, &compilation_result, /*custom_legalization_passes=*/{}); if (!compilation_status.ok()) { - LOG(ERROR) << "TF/XLA compilation failed: " - << 
compilation_status.ToString(); + LOG(ERROR) << "TF/XLA compilation failed: " << compilation_status; return mlir::failure(); } @@ -403,7 +403,7 @@ SerializedMlirStringAttrToMlirModuleTranslate(llvm::StringRef input, auto status = DeserializeMlirModule(str_attr.getValue().str(), context, &module_ref); if (!status.ok()) { - LOG(ERROR) << status.ToString(); + LOG(ERROR) << status; return nullptr; } diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.cc index b07b4ad6f5a..9c82c728f5d 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.cc @@ -13,6 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include +#include +#include + #include "mlir/Analysis/CallGraph.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/utils/device_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h index 69e4bc0593b..46ead1b827b 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_cluster_util.h @@ -16,6 +16,10 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_CLUSTER_UTIL_H_ #define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_TPU_CLUSTER_UTIL_H_ +#include +#include +#include + #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc index 449e0532cf0..c7ce98aff86 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.cc @@ -21,6 +21,7 @@ limitations under the License. #include #include #include +#include #include "absl/strings/string_view.h" #include "llvm/ADT/ArrayRef.h" @@ -222,7 +223,7 @@ StatusOr GetFullMeshTPUExecutionDeviceAssignment( // Helper struct for keeping track of task and device for an associated TPU // device coordinate. struct TaskAndDevice { - TaskAndDevice() {} + TaskAndDevice() = default; TaskAndDevice(int task, int device) : task(task), device(device) {} int task = -1; diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h index 77f853be582..183688cd88c 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include "absl/strings/string_view.h" #include "llvm/ADT/ArrayRef.h" @@ -42,7 +43,7 @@ inline constexpr absl::string_view kDeviceAssignmentAttr = "device_assignment"; // A TPU device for execution alongside its associated host CPU device. 
struct TPUDeviceAndHost { - TPUDeviceAndHost() {} + TPUDeviceAndHost() = default; TPUDeviceAndHost(llvm::StringRef device, llvm::StringRef host) : device(device), host(host) {} diff --git a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc index 2f33ccd88b2..fb88bc8bc44 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/tpu_rewrite_device_util_test.cc @@ -17,7 +17,9 @@ limitations under the License. #include #include +#include #include +#include #include "llvm/ADT/StringRef.h" #include "llvm/Support/FormatVariadic.h" diff --git a/tensorflow/compiler/mlir/tensorflow/utils/visitor.cc b/tensorflow/compiler/mlir/tensorflow/utils/visitor.cc new file mode 100644 index 00000000000..517a56de5de --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/visitor.cc @@ -0,0 +1,132 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tensorflow/utils/visitor.h" + +#include + +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TF { + +WalkResult WalkReachableFunctions( + func::FuncOp func, + llvm::function_ref callback, + SymbolTableCollection* symbol_table) { + llvm::SmallDenseSet visited; + + llvm::SmallVector stack; + stack.push_back(func); + + while (!stack.empty()) { + func::FuncOp f = stack.back(); + stack.pop_back(); + + if (!visited.insert(f).second) { + continue; + } + + WalkResult result = callback(f); + if (result.wasInterrupted()) { + return result; + } else if (result.wasSkipped()) { + continue; + } + + result = f.walk([&](Operation* op) { + const auto uses = SymbolTable::getSymbolUses(op); + if (!uses.has_value()) { + op->emitOpError() << "contains a potentially unknown symbol table"; + return WalkResult::interrupt(); + } + + for (const SymbolTable::SymbolUse& use : *uses) { + func::FuncOp called_func = + symbol_table != nullptr + ? 
symbol_table->lookupNearestSymbolFrom( + use.getUser(), use.getSymbolRef()) + : SymbolTable::lookupNearestSymbolFrom< + func::FuncOp>(use.getUser(), use.getSymbolRef()); + if (called_func == nullptr) { + op->emitOpError() + << "refers to an unknown symbol (expects a function)"; + return WalkResult::interrupt(); + } + stack.push_back(called_func); + } + + return WalkResult::advance(); + }); + if (result.wasInterrupted()) { + return result; + } + } + + return WalkResult::advance(); +} + +FailureOr> CreatePrunedModule( + ModuleOp module, llvm::ArrayRef function_names) { + SymbolTableCollection symbol_table; + OpBuilder builder(module.getContext()); + + OwningOpRef pruned = + builder.create(module->getLoc()); + (*pruned)->setAttrs(module->getAttrs()); + builder.setInsertionPointToEnd(pruned->getBody()); + + llvm::SmallDenseSet added; + for (const llvm::StringRef function_name : function_names) { + auto func = + llvm::dyn_cast_or_null(symbol_table.lookupSymbolIn( + module, builder.getStringAttr(function_name))); + if (func == nullptr) { + return module.emitError() + << "Cannot find function '" << function_name << "'"; + } + + const WalkResult result = WalkReachableFunctions( + func, + [&](func::FuncOp f) { + if (!added.insert(f).second) { + return WalkResult::skip(); + } + builder.clone(*f); + return WalkResult::advance(); + }, + &symbol_table); + if (result.wasInterrupted()) { + return failure(); + } + } + + return pruned; +} + +} // namespace TF +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/utils/visitor.h b/tensorflow/compiler/mlir/tensorflow/utils/visitor.h new file mode 100644 index 00000000000..6a7ada0bdb8 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/visitor.h @@ -0,0 +1,50 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_H_ + +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace mlir { +namespace TF { + +// Walks the function by following function call chains and calling the callback +// for each reachable function (including `func`). Each function is visited only +// once even if it's called from multiple places and/or recursively. +// +// The current implementation follows direct calls to `mlir::func::FuncOp` only +// and returns a `mlir::WalkResult::interrupt()` when it encounters a call whose +// callee cannot be resolved to `mlir::func::FuncOp`. +mlir::WalkResult WalkReachableFunctions( + mlir::func::FuncOp func, + llvm::function_ref callback, + mlir::SymbolTableCollection* symbol_table = nullptr); + +// Creates a new MLIR module that contains only the given functions and all +// reachable functions from them. 
+mlir::FailureOr> CreatePrunedModule( + mlir::ModuleOp module, llvm::ArrayRef function_names); + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_VISITOR_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h b/tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h new file mode 100644 index 00000000000..5f8275b21d9 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h @@ -0,0 +1,44 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_CALL_MODULE_ATTRS_H_ +#define TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_CALL_MODULE_ATTRS_H_ + +#include "llvm/ADT/StringRef.h" + +namespace mlir { +namespace TF { + +// The main function's name in the serialized stablehlo module embedded in +// XlaCallModule's `module` attribute. +constexpr llvm::StringRef kStablehloMainFunctionName = "main"; + +// After deserializing the stablehlo functions from XlaCallModule, +// this XlaCallModule attribute refers to the deserialized stablehlo main +// function. +constexpr llvm::StringRef kStablehloEntryFunctionAttrName = "_entry_function"; + +// Every stablehlo function deserialized from XlaCallModule has this attribute. 
+constexpr llvm::StringRef kFromXlaCallModuleAttrName = "_from_xla_call_module"; + +// Name of `tf.XlaCallModule`'s dictionary attribute for keeping the +// deserialized stablehlo module's attributes. +constexpr llvm::StringRef kStablehloModuleAttrsAttrName = + "_stablehlo_module_attrs"; + +} // namespace TF +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_CALL_MODULE_ATTRS_H_ diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc index e55ba55caf9..838624e0d2f 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.cc @@ -41,47 +41,6 @@ namespace { constexpr char kNumSplitAttr[] = "num_split"; -// Gets the proper tensor dimension from XLA OpSharding. -// "replicate_on_last_tile_dim" and "last_tile_dims" should be deducted from the -// real Tensor dimensions when tiled. -// For example: -// f32[8,512](sharding={devices=[1,1,2]0,1 last_tile_dims={REPLICATED}) -// also means a replicated tensor over all devices. -// -// See xla_data.proto for detailed explanations on the fields. -int GetDimsFromXLAShardingTiled(const xla::OpSharding& xla_sharding) { - return xla_sharding.tile_assignment_dimensions_size() - - (xla_sharding.replicate_on_last_tile_dim() ? 1 : 0) - - xla_sharding.last_tile_dims_size(); -} - -// A sharding with OTHER type may be REPLICATED if: -// 'replicate_on_last_tile_dim' is true OR -// 'last_tile_dims' is not empty -// AND -// other than replicated last tile dims, all other dims are not sharded. 
-bool IsOtherReplicatedSharding(const xla::OpSharding& xla_sharding) { - int max_dim = GetDimsFromXLAShardingTiled(xla_sharding); - for (int i = 0; i < max_dim; ++i) { - if (xla_sharding.tile_assignment_dimensions(i) != 1) { - return false; - } - } - return xla_sharding.type() == xla::OpSharding::OTHER && - (xla_sharding.replicate_on_last_tile_dim() || - !xla_sharding.last_tile_dims().empty()); -} - -bool IsSplitSharding(const xla::OpSharding& sharding) { - return sharding.type() == xla::OpSharding::OTHER && - !IsOtherReplicatedSharding(sharding); -} - -bool IsReplicatedSharding(const xla::OpSharding& sharding) { - return sharding.type() == xla::OpSharding::REPLICATED || - IsOtherReplicatedSharding(sharding); -} - // Creates a tf::SplitOp that splits 'src_input' into 'num_splits' ways // in 'split_dimension' dimension and returns the split values. mlir::LogicalResult CreateSplitOp(const int num_split, @@ -241,6 +200,34 @@ bool UnsupportedPartitionedShardingType(xla::OpSharding::Type sharding) { } // namespace +int GetDimsFromXLAShardingTiled(const xla::OpSharding& xla_sharding) { + return xla_sharding.tile_assignment_dimensions_size() - + (xla_sharding.replicate_on_last_tile_dim() ? 
1 : 0) - + xla_sharding.last_tile_dims_size(); +} + +bool IsOtherReplicatedSharding(const xla::OpSharding& xla_sharding) { + int max_dim = GetDimsFromXLAShardingTiled(xla_sharding); + for (int i = 0; i < max_dim; ++i) { + if (xla_sharding.tile_assignment_dimensions(i) != 1) { + return false; + } + } + return xla_sharding.type() == xla::OpSharding::OTHER && + (xla_sharding.replicate_on_last_tile_dim() || + !xla_sharding.last_tile_dims().empty()); +} + +bool IsSplitSharding(const xla::OpSharding& sharding) { + return sharding.type() == xla::OpSharding::OTHER && + !IsOtherReplicatedSharding(sharding); +} + +bool IsReplicatedSharding(const xla::OpSharding& sharding) { + return sharding.type() == xla::OpSharding::REPLICATED || + IsOtherReplicatedSharding(sharding); +} + mlir::LogicalResult ExtractInputsForLogicalDevices( const int num_cores_per_replica, mlir::tf_device::ClusterFuncOp cluster_func, mlir::OpBuilder* builder, diff --git a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h index 3297b9aa5b5..715a9ce1c1a 100644 --- a/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h +++ b/tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h @@ -37,25 +37,24 @@ inline constexpr absl::string_view kOutputShardingAttr = // Parses "input_sharding_configuration" attribute and returns a list where i-th // element is a list of mlir::Value's which represent inputs for the TPU -// computation correponding to i-th logical device. If the attribute does not +// computation corresponding to i-th logical device. If the attribute does not // exist, the all inputs are placed on logical core 0. 
mlir::LogicalResult ExtractInputsForLogicalDevices( - const int num_cores_per_replica, - mlir::tf_device::ClusterFuncOp cluster_func, mlir::OpBuilder* builder, + int num_cores_per_replica, mlir::tf_device::ClusterFuncOp cluster_func, + mlir::OpBuilder* builder, llvm::SmallVectorImpl>* input_list); // Extracts a list of OpSharding that represent output sharding configuration of // `tf_device.cluster`. mlir::LogicalResult ParseAndValidateOutputSharding( - const int num_cores_per_replica, - mlir::tf_device::ClusterFuncOp cluster_func, + int num_cores_per_replica, mlir::tf_device::ClusterFuncOp cluster_func, mlir::SmallVector* output_sharding_list); // Retrieves output types for TPUExecute op representing execution for provided // logical device id. TPUExecute op for different logical device may have // different outputs depending on the output sharding configuration. mlir::LogicalResult GetOutputTypesForLogicalDeviceComputation( - const int core_id, llvm::ArrayRef output_sharding_config, + int core_id, llvm::ArrayRef output_sharding_config, mlir::tf_device::ClusterFuncOp cluster_func, llvm::SmallVectorImpl* output_types, llvm::SmallVectorImpl* cluster_to_core_index); @@ -80,6 +79,31 @@ mlir::LogicalResult RemapOutputsFromLogicalDevices( llvm::SmallVector, 4> GetMetadataArgumentMapping( const tpu::TPUCompileMetadataProto& metadata); +// Gets the proper tensor dimension from XLA OpSharding. +// "replicate_on_last_tile_dim" and "last_tile_dims" should be deducted from the +// real Tensor dimensions when tiled. +// For example: +// f32[8,512](sharding={devices=[1,1,2]0,1 last_tile_dims={REPLICATED}) +// also means a replicated tensor over all devices. +// +// See xla_data.proto for detailed explanations on the fields. 
+int GetDimsFromXLAShardingTiled(const xla::OpSharding& xla_sharding); + +// A sharding with OTHER type may be REPLICATED if: +// 'replicate_on_last_tile_dim' is true OR +// 'last_tile_dims' is not empty +// AND +// other than replicated last tile dims, all other dims are not sharded. +bool IsOtherReplicatedSharding(const xla::OpSharding& xla_sharding); + +// Returns whether the sharding is split sharding. i.e. A sharding with OTHER +// type but not replicated. +bool IsSplitSharding(const xla::OpSharding& sharding); + +// Returns whether the sharding is replicated. It includes sharding with +// REPLICATED type and replicated OTHER type. +bool IsReplicatedSharding(const xla::OpSharding& sharding); + } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TENSORFLOW_UTILS_XLA_SHARDING_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v0/BUILD index 18744b3032f..72bee83a841 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v0/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/BUILD @@ -27,6 +27,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/mlir/tensorflow:translate_utils", "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy", + "//tensorflow/compiler/mlir/tf2xla/internal:mlir_pass_instrumentation", "//tensorflow/compiler/mlir/tf2xla/transforms:tf_xla_passes", "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_targets", "//tensorflow/compiler/mlir/tf2xla/transforms:xla_legalize_tf", diff --git a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.cc b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.cc index 19c148214e1..108f3760f4f 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.cc @@ -56,6 +56,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/passes.h" #include "tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_targets.h" #include "tensorflow/compiler/tf2xla/layout_util.h" @@ -518,6 +519,10 @@ Status LegalizeToHlo(mlir::ModuleOp module_op, llvm::StringRef device_type, CreateConvertMlirToXlaHloPipeline(tf2xla, device_type, enable_op_fallback, custom_legalization_passes); + auto pass_instrumentors = mlir::GetPassInstrumentors(); + for (const auto& creator : pass_instrumentors) { + tf2xla.addInstrumentation(creator()); + } if (DEBUG_DATA_DUMPER()->ShouldDump(module_name.str(), kDebugGroupMain) || VLOG_IS_ON(1)) { tensorflow::DumpMlirOpToFile( diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD index a95e558f506..d9d4a963648 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD @@ -47,6 +47,7 @@ cc_library( "//tensorflow/core/tpu/kernels:tpu_util_hdrs", "//tensorflow/tsl/platform:status", "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/log", "@com_google_absl//absl/status", "@com_google_absl//absl/types:variant", "@llvm-project//mlir:IR", @@ -71,7 +72,7 @@ tf_cc_test( "//tensorflow/core/tpu/kernels:tpu_compile_op_support", "//tensorflow/tsl/lib/monitoring:test_utils", "//tensorflow/tsl/platform:statusor", - "@com_google_googletest//:gtest_main", + "@com_google_googletest//:gtest", ], ) diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.cc index f5f6818d33e..95913f9692a 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.cc +++ 
b/tensorflow/compiler/mlir/tf2xla/api/v1/legalize_tf.cc @@ -20,6 +20,7 @@ limitations under the License. #include #include +#include "absl/log/log.h" #include "absl/status/status.h" #include "absl/types/variant.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -266,6 +267,27 @@ tsl::StatusOr LegalizeMlirToHlo( return old_bridge_status; } + if (VLOG_IS_ON(2)) { + xla::DebugOptions debug_options; + TF_ASSIGN_OR_RETURN( + auto hlo_module_config, + xla::HloModule::CreateModuleConfigFromProto( + compilation_result.computation->proto(), debug_options)); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_module, + xla::HloModule::CreateFromProto(compilation_result.computation->proto(), + hlo_module_config)); + + std::string all_computations; + for (auto computation : hlo_module->computations()) { + all_computations += computation->ToString() + "\n\n"; + } + + tensorflow::DumpRawStringToFile("legalize_tf_fallback_hlo", + all_computations); + } + if (filtered_graph) { mlir_second_phase_count->GetCell(kOldBridgeMlirFilteredSuccess) ->IncrementBy(1); diff --git a/tensorflow/compiler/mlir/tf2xla/internal/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/BUILD new file mode 100644 index 00000000000..6913853f682 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/BUILD @@ -0,0 +1,31 @@ +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") +load("//tensorflow:tensorflow.bzl", "tf_cc_test") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + "//tensorflow/compiler/mlir/tf2xla/api/v0:__subpackages__", + ], +) + +cc_library( + name = "mlir_pass_instrumentation", + srcs = ["mlir_pass_instrumentation.cc"], + hdrs = ["mlir_pass_instrumentation.h"], + deps = [ + "//tensorflow/core/platform:logging", + "@llvm-project//mlir:Pass", + ], +) + +tf_cc_test( + name = "mlir_pass_instrumentation_test", + srcs = ["mlir_pass_instrumentation_test.cc"], + deps = [ + ":mlir_pass_instrumentation", + 
"//tensorflow/compiler/mlir/tf2xla/api/v0:compile_mlir_util_no_tf_dialect_passes", + "//tensorflow/core:test", + "//tensorflow/tsl/lib/core:status_test_util", + "@com_google_googletest//:gtest_main", + ], +) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.cc b/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.cc new file mode 100644 index 00000000000..f6366f47011 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.cc @@ -0,0 +1,66 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/platform/logging.h" + +namespace mlir { + +class MlirPassInstrumentationRegistry { + public: + static MlirPassInstrumentationRegistry& Instance() { + static MlirPassInstrumentationRegistry* r = + new MlirPassInstrumentationRegistry; + return *r; + } + std::unordered_map()>> + instrumentors_; +}; + +void RegisterPassInstrumentor( + const std::string& name, + std::function()> creator) { + MlirPassInstrumentationRegistry& r = + MlirPassInstrumentationRegistry::Instance(); + auto result = r.instrumentors_.emplace(name, creator); + if (!result.second) { + VLOG(1) << "Duplicate MLIR pass instrumentor registration"; + } +} + +std::vector()>> +GetPassInstrumentors() { + MlirPassInstrumentationRegistry& r = + MlirPassInstrumentationRegistry::Instance(); + std::vector()>> result; + result.reserve(r.instrumentors_.size()); + + std::transform(r.instrumentors_.begin(), r.instrumentors_.end(), + std::back_inserter(result), [](auto v) { return v.second; }); + + return result; +} + +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h b/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h new file mode 100644 index 00000000000..f4375dfc562 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h @@ -0,0 +1,36 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_PASS_INSTRUMENTATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_PASS_INSTRUMENTATION_H_ + +#include +#include +#include +#include + +#include "mlir/Pass/PassInstrumentation.h" // from @llvm-project + +namespace mlir { + +void RegisterPassInstrumentor( + const std::string& name, + std::function()> creator); +std::vector()>> +GetPassInstrumentors(); + +} // namespace mlir + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_PASS_INSTRUMENTATION_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation_test.cc new file mode 100644 index 00000000000..b2a8dde0700 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation_test.cc @@ -0,0 +1,109 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/internal/mlir_pass_instrumentation.h" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include "tensorflow/compiler/mlir/tf2xla/api/v0/compile_mlir_util.h" +#include "tensorflow/core/lib/core/status_test_util.h" + +namespace mlir { +namespace { +static const char* kTestInstrumentationName = "test-intrumentatron"; +static const char* kTestInstrumentationSearch = "tf.Identity"; + +struct StringStream : public llvm::raw_ostream { + StringStream() { SetUnbuffered(); } + ~StringStream() override = default; + uint64_t current_pos() const override { return 0; } + + void write_impl(const char* ptr, size_t size) override { + ss.write(ptr, size); + } + std::stringstream ss; +}; + +class TestPassInstrumentation : public ::testing::Test { + public: + void SetPassThatChangedIdentity(absl::string_view pass_name) { + pass_that_changed_identity_ = pass_name; + } + absl::string_view GetPassThatChangedIdentity() { + return pass_that_changed_identity_; + } + + private: + std::string pass_that_changed_identity_; + friend class TestInstrumentor; +}; + +class TestInstrumentor : public PassInstrumentation { + public: + explicit TestInstrumentor(TestPassInstrumentation* test) : test_(test) {} + + private: + void runBeforePass(Pass* pass, Operation* op) override { + StringStream stream; + op->print(stream, mlir::OpPrintingFlags().useLocalScope()); + ops_seen_by_pass_[pass] = stream.ss.str(); + } + void runAfterPass(Pass* pass, Operation* op) override { + StringStream stream; + op->print(stream, mlir::OpPrintingFlags().useLocalScope()); + if (!absl::StrContains(stream.ss.str(), kTestInstrumentationSearch) && + absl::StrContains(ops_seen_by_pass_[pass], + kTestInstrumentationSearch)) { + test_->SetPassThatChangedIdentity(pass->getName().str()); + } + } + + private: + TestPassInstrumentation* test_; + std::unordered_map 
ops_seen_by_pass_; +}; + +TEST_F(TestPassInstrumentation, CreatedCalledAndSetsPassName) { + RegisterPassInstrumentor(kTestInstrumentationName, [&]() { + return std::make_unique(this); + }); + constexpr char legalization[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main(%arg0: tensor>) -> tensor> { + %0 = "tf.Identity"(%arg0) : (tensor>) -> tensor> + func.return %0 : tensor> + } + })"; + SetPassThatChangedIdentity(""); + std::vector<::tensorflow::TensorShape> arg_shapes = {{1}}; + auto compilation_result = tensorflow::XlaCompilationResult(); + + TF_EXPECT_OK(tensorflow::CompileSerializedMlirToXlaHlo( + legalization, arg_shapes, /*device_type=*/"XLA_TPU_JIT", + /*use_tuple_args=*/true, /*enable_op_fallback=*/false, + /*shape_determination_fns=*/{}, &compilation_result)); + + EXPECT_FALSE(GetPassThatChangedIdentity().empty()); +} + +} // namespace +} // namespace mlir diff --git a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc index 6479253dd6e..3f35813744c 100644 --- a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc +++ b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc @@ -24,7 +24,7 @@ namespace tensorflow { MlirBridgeRolloutPolicy GetMlirBridgeRolloutPolicy( const tensorflow::Graph& graph, const FunctionLibraryDefinition* function_library, - std::optional config_proto, bool is_tpu_graph, + std::optional config_proto, bool run_tpu_bridge, bool uses_uninitialized_resource_args, bool is_v1_compat, bool record_stats) { switch (GetMlirBridgeRolloutState(config_proto)) { diff --git a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h index 9f67442205d..5c7f47a219e 100644 --- a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h +++ b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h @@ -53,7 
+53,7 @@ enum class MlirBridgeRolloutPolicy { MlirBridgeRolloutPolicy GetMlirBridgeRolloutPolicy( const tensorflow::Graph& graph, const FunctionLibraryDefinition* function_library, - std::optional config_proto, bool is_tpu_graph, + std::optional config_proto, bool run_tpu_bridge, bool uses_uninitialized_resource_args, bool is_v1_compat, bool record_stats); diff --git a/tensorflow/compiler/mlir/tf2xla/tests/BUILD b/tensorflow/compiler/mlir/tf2xla/tests/BUILD index a728ed58ad9..c68c485954d 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", size_override = { diff --git a/tensorflow/compiler/mlir/tf2xla/tests/convert-mhlo-quant-to-int.mlir b/tensorflow/compiler/mlir/tf2xla/tests/convert-mhlo-quant-to-int.mlir index 849c270f083..947c9f85624 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/convert-mhlo-quant-to-int.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/convert-mhlo-quant-to-int.mlir @@ -7,21 +7,21 @@ func.func @uniform_quantize_and_dequantize(%arg0: tensor) -> tensor : tensor // CHECK-DAG: %[[QUANT_MIN:.*]] = mhlo.constant dense<-128> : tensor // CHECK-DAG: %[[QUANT_MAX:.*]] = mhlo.constant dense<127> : tensor - // CHECK: %[[VAL0:.*]] = chlo.broadcast_divide %arg0, %[[SCALES]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor - // CHECK: %[[VAL1:.*]] = chlo.broadcast_add %[[VAL0]], %[[HALF]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: %[[VAL0:.*]] = chlo.broadcast_divide %arg0, %[[SCALES]] : (tensor, tensor) -> tensor + // CHECK: %[[VAL1:.*]] = chlo.broadcast_add %[[VAL0]], %[[HALF]] : (tensor, tensor) -> tensor // CHECK: %[[VAL2:.*]] = mhlo.floor %[[VAL1]] : tensor // CHECK: %[[VAL3:.*]] = mhlo.convert %[[VAL2]] : (tensor) -> tensor - // CHECK: %[[VAL4:.*]] = chlo.broadcast_add %[[VAL3]], 
%[[ZPS]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor - // CHECK: %[[VAL5:.*]] = chlo.broadcast_maximum %[[VAL4]], %[[QUANT_MIN]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor - // CHECK: %[[VAL6:.*]] = chlo.broadcast_minimum %[[VAL5]], %[[QUANT_MAX]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: %[[VAL4:.*]] = chlo.broadcast_add %[[VAL3]], %[[ZPS]] : (tensor, tensor) -> tensor + // CHECK: %[[VAL5:.*]] = chlo.broadcast_maximum %[[VAL4]], %[[QUANT_MIN]] : (tensor, tensor) -> tensor + // CHECK: %[[VAL6:.*]] = chlo.broadcast_minimum %[[VAL5]], %[[QUANT_MAX]] : (tensor, tensor) -> tensor // CHECK: %[[VAL7:.*]] = mhlo.convert %[[VAL6]] : (tensor) -> tensor // CHECK-DAG: %[[SCALES_DQ:.*]] = mhlo.constant dense<1.000000e+00> : tensor // CHECK-DAG: %[[ZPS_DQ:.*]] = mhlo.constant dense<3> : tensor // CHECK: %[[VAL8:.*]] = mhlo.convert %[[VAL7]] : (tensor) -> tensor - // CHECK: %[[VAL9:.*]] = chlo.broadcast_subtract %[[VAL8]], %[[ZPS_DQ]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: %[[VAL9:.*]] = chlo.broadcast_subtract %[[VAL8]], %[[ZPS_DQ]] : (tensor, tensor) -> tensor // CHECK: %[[VAL10:.*]] = mhlo.convert %[[VAL9]] : (tensor) -> tensor - // CHECK: %[[VAL11:.*]] = chlo.broadcast_multiply %[[VAL10]], %[[SCALES_DQ]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: %[[VAL11:.*]] = chlo.broadcast_multiply %[[VAL10]], %[[SCALES_DQ]] : (tensor, tensor) -> tensor // CHECK: return %[[VAL11]] : tensor %0 = mhlo.uniform_quantize %arg0 : (tensor) -> tensor> %1 = mhlo.uniform_dequantize %0 : (tensor>) -> tensor @@ -33,7 +33,7 @@ func.func @uniform_quantize_and_dequantize(%arg0: tensor) -> tensor>) -> () { // CHECK: %[[QUANTIZED:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor>) -> tensor> - // CHECK: %[[DEQUANTIZED:.*]] = chlo.broadcast_multiply %[[VAL1:.*]], %[[CONST_SCALE:.*]] 
{broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor>, tensor) -> tensor> + // CHECK: %[[DEQUANTIZED:.*]] = chlo.broadcast_multiply %[[VAL1:.*]], %[[CONST_SCALE:.*]] : (tensor>, tensor) -> tensor> %0 = mhlo.uniform_quantize %arg0 : (tensor>) -> tensor, #mhlo.type_extensions> %1 = mhlo.uniform_dequantize %0 : (tensor, #mhlo.type_extensions>) -> tensor> return @@ -42,10 +42,10 @@ func.func @uniform_quantize_and_dequantize_type_exensions(%arg0: tensor>) -> () { - // CHECK: %[[QUANTIZED:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor>) -> tensor> - // CHECK: %[[DEQUANTIZED:.*]] = chlo.broadcast_multiply %[[VAL1:.*]], %[[CONST_SCALE:.*]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor>, tensor) -> tensor> - %0 = mhlo.uniform_quantize %arg0 : (tensor>) -> tensor, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>> - %1 = mhlo.uniform_dequantize %0 : (tensor, #sparse_tensor.encoding<{ dimLevelType = [ "compressed" ] }>>) -> tensor> +func.func @uniform_quantize_and_dequantize_sparse_tensor_encoding(%arg0: tensor>) -> () { + // CHECK: %[[QUANTIZED:.*]] = mhlo.convert %[[VAL0:.*]] : (tensor>) -> tensor> + // CHECK: %[[DEQUANTIZED:.*]] = chlo.broadcast_multiply %[[VAL1:.*]], %[[CONST_SCALE:.*]] : (tensor>, tensor) -> tensor> + %0 = mhlo.uniform_quantize %arg0 : (tensor>) -> tensor, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>> + %1 = mhlo.uniform_dequantize %0 : (tensor, #sparse_tensor.encoding<{ lvlTypes = [ "compressed" ] }>>) -> tensor> return } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir index 28e1c0c37a1..11e19b3b1a6 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_runtime_pipeline.mlir @@ -9,7 +9,7 @@ func.func @simple_add(%arg0: tensor) -> tensor { // ----- -#CSR = #sparse_tensor.encoding<{dimLevelType = [ "dense", "compressed" ]}> +#CSR = 
#sparse_tensor.encoding<{lvlTypes = [ "dense", "compressed" ]}> // CHECK-LABEL: func.func @csr_gendot( // CHECK-SAME: %[[PTR:.*0]]: memref, @@ -53,8 +53,8 @@ func.func @csr_gendot(%arg0: tensor<32x64xf64, #CSR>, // ----- -#CSR = #sparse_tensor.encoding<{ dimLevelType = ["dense", "compressed"] }> -#DCSR = #sparse_tensor.encoding<{ dimLevelType = ["compressed", "compressed"] }> +#CSR = #sparse_tensor.encoding<{ lvlTypes = ["dense", "compressed"] }> +#DCSR = #sparse_tensor.encoding<{ lvlTypes = ["compressed", "compressed"] }> // CHECK-LABEL: func.func @convert_nop( // CHECK-SAME: %[[PTR:.*0]]: memref, diff --git a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_sparsification.mlir b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_sparsification.mlir index 3a5d2e95e24..920c2ab744c 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_sparsification.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/hlo_xla_sparsification.mlir @@ -1,6 +1,6 @@ // RUN: tf-opt -hlo-legalize-to-linalg -hlo-xla-runtime-sparsification %s | FileCheck %s -#SparseVector = #sparse_tensor.encoding<{ dimLevelType = ["compressed"] }> +#SparseVector = #sparse_tensor.encoding<{ lvlTypes = ["compressed"] }> // CHECK-LABEL: func.func @mult_sparse_dense( // CHECK-SAME: %[[PTR:.*0]]: memref, diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-collective.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-collective.mlir index fefaeb7d589..bd1b01fa171 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-collective.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-collective.mlir @@ -89,6 +89,15 @@ func.func @xla_all_reduce_mul(%input: tensor) -> tensor { func.return %0 : tensor } +// ----- + +func.func @xla_all_reduce_tuple(%input: tuple, tensor>) -> tuple, tensor> { + %group_assignment = "tf.Const"() { value = dense<[[0],[1]]> : tensor<2x1xi32> } : () -> tensor<2x1xi32> + // expected-error@+1 {{'tf.XlaAllReduce' op operand #0 must be tensor of bfloat16 or 16-bit float 
or 32-bit float or 32-bit integer or 32-bit unsigned integer values, but got 'tuple, tensor>'}} + %0 = "tf.XlaAllReduce"(%input, %group_assignment) {reduce_op = "Add", mode = "CrossReplica"} : (tuple, tensor>, tensor<2x1xi32>) -> tuple, tensor> + func.return %0 : tuple, tensor> +} + // ----- diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-no-tf2xla-fallback.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-no-tf2xla-fallback.mlir index 3ca8dc09a80..92fa37f7e44 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-no-tf2xla-fallback.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-no-tf2xla-fallback.mlir @@ -4352,7 +4352,7 @@ func.func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { // CHECK-LABEL: conv_dynamic func.func @conv_dynamic(%arg0: tensor, %arg1: tensor<3x3x3x16xf32>) -> tensor { // CHECK: "mhlo.dynamic_conv" - // CHECK-SAME: {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<[b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]>, feature_group_count = 2 : i64, rhs_dilation = dense<[2, 3]> : tensor<2xi64>, window_strides = dense<[4, 5]> : tensor<2xi64>} : (tensor, tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor + // CHECK-SAME: {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<[b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]>, feature_group_count = 2 : i64, precision_config = [#mhlo, #mhlo], rhs_dilation = dense<[2, 3]> : tensor<2xi64>, window_strides = dense<[4, 5]> : tensor<2xi64>} : (tensor, tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor %0 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor, tensor<3x3x3x16xf32>) -> tensor func.return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir index 1d6bfb6bcd7..730e3ec09c4 100644 --- 
a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla-hlo-importer.mlir @@ -553,4 +553,157 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr %values, %indices = "tf.ApproxTopK"(%0) {aggregate_to_topk = true, device = "", is_max_k = true, k = 10 : i64, recall_target = 0.949999988 : f32, reduction_dimension = -1 : i64, reduction_input_size_override = -1 : i64} : (tensor<10x500xbf16>) -> (tensor<10x10xbf16>, tensor<10x10xi32>) return %values : tensor<10x10xbf16> } + + // CHECK-LABEL: fusedBatchNormV3_noTraining + func.func @fusedBatchNormV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { + // CHECK: "mhlo.batch_norm_inference"({{.*}}, %arg1, %arg2, %arg3, %arg4) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> tensor<8x8x8x8xf32> + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, %arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + func.return %0#0 : tensor<8x8x8x8xf32> + } + + // CHECK-LABEL: fusedBatchNormV3_training + func.func @fusedBatchNormV3_training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { + // CHECK: %[[OUT:.*]], %[[MEAN:.*]], %[[VAR:.*]] = "mhlo.batch_norm_training"({{.*}}, %arg1, %arg2) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + %0:6 = "tf.FusedBatchNormV3"(%arg0, %arg1, %arg2, 
%arg3, %arg4) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, exponential_avg_factor = 1.0 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + func.return %0#0 : tensor<8x8x8x8xf32> + } + + // CHECK-LABEL: fusedBatchNormGradV3_noTraining + func.func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>, %arg5: tensor<8xf32>) -> (tensor<8x8x8x8xf32>) { + // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> + // CHECK: %[[scr1:.*]] = mhlo.rsqrt + // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[sub:.*]] = mhlo.subtract %[[act]], %[[bcast_arg3]] : tensor<8x8x8x8xf32> + // CHECK: %[[mul:.*]] = mhlo.multiply %[[grad]], %[[sub]] : tensor<8x8x8x8xf32> + // CHECK: mhlo.constant dense<[0, 1, 2]> : tensor<3xi64> + // CHECK-NEXT: %[[cmul:.*]] = mhlo.convert %[[mul]] : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[init:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-NEXT: %[[convert_init:.*]] = mhlo.convert %[[init]] : tensor + // CHECK: %[[red1:.*]] = mhlo.reduce(%[[cmul]] init: %[[convert_init]]) across dimensions = [0, 1, 2] : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> + // CHECK: %[[scr2:.*]] = mhlo.convert %[[red1]] : tensor<8xf32> + + // CHECK: %[[mul2:.*]] = mhlo.multiply %arg2, %[[scr1]] : tensor<8xf32> + // CHECK: %[[bcast_mul2:.+]] = "mhlo.dynamic_broadcast_in_dim"(%[[mul2]], {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> + // CHECK: %[[mul3:.*]] = mhlo.multiply %[[grad]], 
%[[bcast_mul2]] : tensor<8x8x8x8xf32> + + // CHECK: %[[scale_backprop:.*]] = mhlo.multiply %[[scr1]], %[[scr2]] : tensor<8xf32> + + // CHECK: mhlo.constant dense<[0, 1, 2]> : tensor<3xi64> + // CHECK: %[[cgrad:.*]] = mhlo.convert %[[grad]] : tensor<8x8x8x8xf32> + // CHECK: %[[init2:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK-NEXT: %[[convert_init2:.*]] = mhlo.convert %[[init2]] : tensor + // CHECK: %[[red2:.*]] = mhlo.reduce(%[[cgrad]] init: %[[convert_init2]]) across dimensions = [0, 1, 2] : (tensor<8x8x8x8xf32>, tensor) -> tensor<8xf32> + // CHECK: %[[offset_backprop:.*]] = mhlo.convert %[[red2]] : tensor<8xf32> + + // CHECK: %[[x_backprop:.*]] = mhlo.convert %[[mul3]] : tensor<8x8x8x8xf32> + // CHECK: return %[[x_backprop]] : tensor<8x8x8x8xf32> + + %0:5 = "tf.FusedBatchNormGradV3"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + func.return %0#0 : tensor<8x8x8x8xf32> + } + + // CHECK-LABEL: fusedBatchNormGradV3_Training + func.func @fusedBatchNormGradV3_Training(%arg0: tensor<8x8x8x8xf32>, %arg1: tensor<8x8x8x8xf32>, %arg2: tensor<8xf32>, %arg3: tensor<8xf32>, %arg4: tensor<8xf32>, %arg5: tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<0xf32>, tensor<*xf32>) { + // CHECK-NEXT: %[[grad:.*]] = mhlo.convert %arg0 : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> + // CHECK-NEXT: %[[grad_operand:.*]], %[[grad_scale:.*]], %[[grad_offset:.*]] = "mhlo.batch_norm_grad"(%[[act]], %arg2, %arg3, %arg4, %[[grad]]) {epsilon = 1.000000e-03 : f32, feature_index = 3 : i64} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>) + // CHECK-NEXT: 
%[[x_backprop:.*]] = mhlo.convert %[[grad_operand]] : tensor<8x8x8x8xf32> + // CHECK: return %[[x_backprop]] + // CHECK-SAME: tensor<8x8x8x8xf32> + + %0:5 = "tf.FusedBatchNormGradV3"(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = true} : (tensor<8x8x8x8xf32>, tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<0xf32>, tensor<*xf32>) + func.return %0#0, %0#3, %0#4 : tensor<8x8x8x8xf32>, tensor<0xf32>, tensor<*xf32> + } + + // CHECK-LABEL: @max_pool_grad_valid + // CHECK-SAME: %[[INPUT:.*]]: tensor<10x24x24x64xf32>, %arg1: tensor<10x12x12x64xf32>, %[[GRAD:.*]]: tensor<10x12x12x64xf32> + func.func @max_pool_grad_valid(%orig_input: tensor<10x24x24x64xf32>, %orig_output: tensor<10x12x12x64xf32>, %grad: tensor<10x12x12x64xf32>) -> tensor<10x24x24x64xf32> { + // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor + // CHECK: %[[RESULT:.*]] = "mhlo.select_and_scatter"(%[[INPUT]], %[[GRAD]], %[[ZERO]]) ({ + // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): + // CHECK: %[[SELECT_RESULT:.*]] = mhlo.compare GE, %[[VALUE_A]], %[[VALUE_B]] : (tensor, tensor) -> tensor + // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor + // CHECK: }, { + // CHECK: ^bb0(%[[VALUE_A:.*]]: tensor, %[[VALUE_B:.*]]: tensor): + // CHECK: %[[SELECT_RESULT:.*]] = mhlo.add %[[VALUE_A]], %[[VALUE_B]] : tensor + // CHECK: mhlo.return %[[SELECT_RESULT]] : tensor + // CHECK: }) {padding = dense<0> : tensor<4x2xi64>, window_dimensions = dense<[1, 2, 2, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>} : (tensor<10x24x24x64xf32>, tensor<10x12x12x64xf32>, tensor) -> tensor<10x24x24x64xf32> + // CHECK: return %[[RESULT]] : tensor<10x24x24x64xf32> + %result = "tf.MaxPoolGrad"(%orig_input, %orig_output, %grad) { + data_format = "NHWC", + explicit_paddings = [], + ksize = [1, 2, 2, 1], + padding 
= "VALID", + strides = [1, 2, 2, 1] + } : (tensor<10x24x24x64xf32>, tensor<10x12x12x64xf32>, tensor<10x12x12x64xf32>) -> tensor<10x24x24x64xf32> + func.return %result : tensor<10x24x24x64xf32> + } + + // CHECK-LABEL: @max_pool_grad_same + func.func @max_pool_grad_same(%orig_input: tensor<2x13x25x7xf32>, %orig_output: tensor<2x4x7x7xf32>, %grad: tensor<2x4x7x7xf32>) -> tensor<2x13x25x7xf32> { + // CHECK: padding = dense<{{\[\[}}0, 0], [0, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> + %result = "tf.MaxPoolGrad"(%orig_input, %orig_output, %grad) { + data_format = "NHWC", + explicit_paddings = [], + ksize = [1, 2, 3, 1], + padding = "SAME", + strides = [1, 4, 4, 1] + } : (tensor<2x13x25x7xf32>, tensor<2x4x7x7xf32>, tensor<2x4x7x7xf32>) -> tensor<2x13x25x7xf32> + func.return %result : tensor<2x13x25x7xf32> + } + + //===--------------------------------------------------------------------===// + // tf.XlaReduceScatter legalization + //===--------------------------------------------------------------------===// + // CHECK-LABEL: func @xla_reduce_scatter + func.func @xla_reduce_scatter(%arg0: tensor<128x128xf32>) -> tensor<64x128xf32> { + %cst = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {value = dense<[[0, 4], [1, 5], [2, 6], [3, 7]]> : tensor<4x2xi32>} : () -> tensor<4x2xi32> + // CHECK: "mhlo.reduce_scatter"(%arg0) + // CHECK{LITERAL}: replica_groups = dense<[[0, 4], [1, 5], [2, 6], [3, 7]]> + // CHECK-SAME: scatter_dimension = 0 + // + %1 = "tf.XlaReduceScatter"(%arg0, %cst_0, %cst) {reduce_op = "Add"} : (tensor<128x128xf32>, tensor<4x2xi32>, tensor) -> tensor<64x128xf32> + func.return %1 : tensor<64x128xf32> + } + + // CHECK-LABEL: func @tf_mod + func.func @tf_mod(%arg1: tensor<2x2xf32>) -> tensor<2x2xf32> { + %cst = "tf.Const"() {value = dense<7.000000e+00> : tensor} : () -> tensor + // CHECK: "mhlo.dynamic_broadcast_in_dim" + // CHECK: mhlo.remainder + %6 = "tf.Mod"(%arg1, %cst) {_global_shape = [#tf_type.shape<4x8>], device = ""} : 
(tensor<2x2xf32>, tensor) -> tensor<2x2xf32> + return %6 : tensor<2x2xf32> + } + + // CHECK-LABEL: func @concat_v2 + func.func @concat_v2(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<6x3xf32> { + // CHECK: "mhlo.concatenate"({{.*}}) {dimension = 0 : i64} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<6x3xf32> + %axis = "tf.Const"() { value = dense<0> : tensor } : () -> tensor + %1 = "tf.ConcatV2"(%arg0, %arg1, %axis) : (tensor<3x3xf32>, tensor<3x3xf32>, tensor) -> tensor<6x3xf32> + func.return %1 : tensor<6x3xf32> + } + + // CHECK-LABEL: func @xla_call_module + func.func @xla_call_module(%arg0: tensor) -> tensor<*xf32> { + // Equivalent to the following: + // + // module @jit_sin { + // func.func public @main(%arg0: tensor) -> tensor { + // %0 = mhlo.sine %arg0 : tensor + // return %0 : tensor + // } + // } + // CHECK: call @main.2 + %0 = "tf.XlaCallModule"(%arg0) {Sout = [#tf_type.shape<*>], device = "", dim_args_spec = [], function_list = [], disabled_checks = [], has_token_input_output = false, module = "ML\EFR\03MLIRxxx-trunk\00\01\17\05\01\05\01\03\05\03\07\07\t\0B\03K5\07\01\1B\07\0B\13\0B3\0B\0B\0B\0B\0F\0B\13\0B\03\1B\0F\1B\0B\0B\0B\0B\0B\0F\13\0B\0B\0B\0B\03\07\0F\17\07\02\A7\1F\05\0D\03\03\03\07\05\0F\03\0B\0B\1B\0D'\0F)\031\113\05\11\05\13\05\15\05\17\1D\15\17\05\19\17\19\EF\01\05\1B\03\03\1D\0D\05\1F!#%\1D\1D\1D\1F\1D!\1D##\03\03\03+\0D\03-/\1D%\1D'\1D)\1D+)\01\05\11\03\01\03\01\t\04A\05\01\11\01\05\07\03\01\05\03\11\01\t\05\03\05\0B\03\01\01\05\06\13\03\01\03\01\07\04\01\03\03\06\03\01\05\01\00\9A\04-\0F\0B\03!\1B\1D\05\1B\83/\1F\15\1D\15\11\13\15\11\11\0F\0B\11builtin\00vhlo\00module\00func_v1\00sine_v1\00return_v1\00sym_name\00jit_sin\00arg_attrs\00function_type\00res_attrs\00sym_visibility\00jit(sin)/jit(main)/sin\00third_party/py/jax/experimental/jax2tf/tests/back_compat_test.py\00jax.arg_info\00x\00mhlo.sharding\00{replicated}\00jax.result_info\00\00main\00public\00", platforms = ["CPU"], version = 6 : i64} : (tensor) -> tensor<*xf32> 
+ func.return %0 : tensor<*xf32> + } + + // Verifies that the following functions are added from xla_call_module. Note this must be at the end of the file. + // CHECK: func.func private @main.2(%arg0: tensor {mhlo.sharding = "{replicated}"}) -> tensor { + // CHECK: %0 = mhlo.sine %arg0 : tensor + // CHECK: return %0 : tensor + // CHECK: } + } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla.mlir index 3e550e0366c..90ef7c88910 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-with-tf2xla.mlir @@ -491,6 +491,21 @@ func.func @approx_topk(%arg0: tensor>> {tf return %values : tensor<10x10xbf16> } +// CHECK-LABEL: func @xla_call_module +func.func @xla_call_module(%arg0: tensor) -> tensor<*xf32> { + // Equivalent to the following: + // + // module @jit_sin { + // func.func public @main(%arg0: tensor) -> tensor { + // %0 = mhlo.sine %arg0 : tensor + // return %0 : tensor + // } + // } + // expected-remark@+1 {{UNIMPLEMENTED: MlirHloBuilder does not support op call}} + %0 = "tf.XlaCallModule"(%arg0) {Sout = [#tf_type.shape<*>], device = "", dim_args_spec = [], function_list = [], disabled_checks = [], has_token_input_output = false, module = 
"ML\EFR\03MLIRxxx-trunk\00\01\17\05\01\05\01\03\05\03\07\07\t\0B\03K5\07\01\1B\07\0B\13\0B3\0B\0B\0B\0B\0F\0B\13\0B\03\1B\0F\1B\0B\0B\0B\0B\0B\0F\13\0B\0B\0B\0B\03\07\0F\17\07\02\A7\1F\05\0D\03\03\03\07\05\0F\03\0B\0B\1B\0D'\0F)\031\113\05\11\05\13\05\15\05\17\1D\15\17\05\19\17\19\EF\01\05\1B\03\03\1D\0D\05\1F!#%\1D\1D\1D\1F\1D!\1D##\03\03\03+\0D\03-/\1D%\1D'\1D)\1D+)\01\05\11\03\01\03\01\t\04A\05\01\11\01\05\07\03\01\05\03\11\01\t\05\03\05\0B\03\01\01\05\06\13\03\01\03\01\07\04\01\03\03\06\03\01\05\01\00\9A\04-\0F\0B\03!\1B\1D\05\1B\83/\1F\15\1D\15\11\13\15\11\11\0F\0B\11builtin\00vhlo\00module\00func_v1\00sine_v1\00return_v1\00sym_name\00jit_sin\00arg_attrs\00function_type\00res_attrs\00sym_visibility\00jit(sin)/jit(main)/sin\00third_party/py/jax/experimental/jax2tf/tests/back_compat_test.py\00jax.arg_info\00x\00mhlo.sharding\00{replicated}\00jax.result_info\00\00main\00public\00", platforms = ["CPU"], version = 6 : i64} : (tensor) -> tensor<*xf32> + func.return %0 : tensor<*xf32> +} + // TODO(hinsu): Add a test with a valid TF op for which tf2xla kernel is // available but doesn't support this instance. 
} diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir index 19fe43f0250..3bce71fa26a 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir @@ -4499,7 +4499,7 @@ func.func @cross_replica_sum(%input: tensor<10xf32>) -> tensor<10xf32> { // CHECK-LABEL: conv_dynamic func.func @conv_dynamic(%arg0: tensor, %arg1: tensor<3x3x3x16xf32>) -> tensor { // CHECK: "mhlo.dynamic_conv" - // CHECK-SAME: {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<[b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]>, feature_group_count = 2 : i64, rhs_dilation = dense<[2, 3]> : tensor<2xi64>, window_strides = dense<[4, 5]> : tensor<2xi64>} : (tensor, tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor + // CHECK-SAME: {batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<[b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f]>, feature_group_count = 2 : i64, precision_config = [#mhlo, #mhlo], rhs_dilation = dense<[2, 3]> : tensor<2xi64>, window_strides = dense<[4, 5]> : tensor<2xi64>} : (tensor, tensor<3x3x3x16xf32>, tensor<4xi32>) -> tensor %0 = "tf.Conv2D"(%arg0, %arg1) {data_format = "NHWC", dilations = [1, 2, 3, 1], padding = "SAME", strides = [1, 4, 5, 1]} : (tensor, tensor<3x3x3x16xf32>) -> tensor func.return %0 : tensor } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD index df4bf8fa204..181d6b582b9 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD @@ -5,6 +5,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_cloud") +load("//tensorflow/tsl/platform:build_config_root.bzl", "if_static") package( # copybara:uncomment 
default_applicable_licenses = ["//tensorflow:license"], @@ -159,6 +160,7 @@ cc_library( "//tensorflow/core/kernels:conv_grad_shape_utils", "//tensorflow/tsl/platform:bfloat16", "//tensorflow/tsl/platform:status", + "//tensorflow/tsl/platform:tensor_float_32_hdr_lib", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:Dialect", @@ -171,7 +173,7 @@ cc_library( "@llvm-project//mlir:TensorDialect", "@llvm-project//mlir:TransformUtils", "@stablehlo//:chlo_ops", - ], + ] + if_static(["//tensorflow/tsl/platform:tensor_float_32_utils"]), ) cc_library( diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/convert_mhlo_quant_to_int.cc b/tensorflow/compiler/mlir/tf2xla/transforms/convert_mhlo_quant_to_int.cc index 382b059ebb4..0343ae3b96b 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/convert_mhlo_quant_to_int.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/convert_mhlo_quant_to_int.cc @@ -114,7 +114,6 @@ class ConvertUniformQuantizeOp op->getLoc(), rewriter.getI32IntegerAttr(static_cast( element_type.getStorageTypeMax()))); - auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); auto res_float_tensor_type_or = GetSameShapeTensorType(op, op.getOperand().getType().cast(), rewriter.getF32Type(), rewriter); @@ -123,10 +122,9 @@ class ConvertUniformQuantizeOp } Value res_float = rewriter.create( op->getLoc(), *res_float_tensor_type_or, adaptor.getOperand(), scale, - scalar_broadcast_dims); + nullptr); res_float = rewriter.create( - op->getLoc(), *res_float_tensor_type_or, res_float, half, - scalar_broadcast_dims); + op->getLoc(), *res_float_tensor_type_or, res_float, half, nullptr); res_float = rewriter.create(op->getLoc(), res_float); auto res_int32_tensor_type_or = GetSameShapeTensorType(op, res_float.getType().cast(), @@ -138,13 +136,13 @@ class ConvertUniformQuantizeOp op->getLoc(), *res_int32_tensor_type_or, res_float); res_int32 = rewriter.create( op->getLoc(), *res_int32_tensor_type_or, res_int32, zero_point, - 
scalar_broadcast_dims); + nullptr); res_int32 = rewriter.create( op->getLoc(), *res_int32_tensor_type_or, res_int32, quantization_min, - scalar_broadcast_dims); + nullptr); res_int32 = rewriter.create( op->getLoc(), *res_int32_tensor_type_or, res_int32, quantization_max, - scalar_broadcast_dims); + nullptr); auto res_final_tensor_type_or = GetSameShapeTensorType(op, res_int32.getType().cast(), rewriter.getI8Type(), rewriter); @@ -177,7 +175,6 @@ class ConvertUniformDequantizeOp static_cast(element_type.getZeroPoint()))); Value input = adaptor.getOperand(); - auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); auto res_int32_tensor_type_or = GetSameShapeTensorType(op, input.getType().cast(), rewriter.getI32Type(), rewriter); @@ -188,7 +185,7 @@ class ConvertUniformDequantizeOp op->getLoc(), *res_int32_tensor_type_or, input); res_int32 = rewriter.create( op->getLoc(), *res_int32_tensor_type_or, res_int32, zero_point, - scalar_broadcast_dims); + nullptr); auto res_float_tensor_type_or = GetSameShapeTensorType(op, res_int32.getType().cast(), rewriter.getF32Type(), rewriter); @@ -198,7 +195,7 @@ class ConvertUniformDequantizeOp Value res_float = rewriter.create( op->getLoc(), *res_float_tensor_type_or, res_int32); res_float = rewriter.replaceOpWithNewOp( - op, *res_float_tensor_type_or, res_float, scale, scalar_broadcast_dims); + op, *res_float_tensor_type_or, res_float, scale, nullptr); return success(); } }; @@ -213,6 +210,8 @@ void ConvertMHLOQuantToInt::runOnOperation() { patterns.add(context); ConversionTarget target(*op->getContext()); + // An addDynamicallyLegalDialect callback that declares a given operation as + // legal only if its all operands and results are non-quantized types. 
auto is_legal = [](Operation *op) { auto is_not_quant = [](Type type) { return !getElementTypeOrSelf(type).isa(); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc index 06d6df007f2..b1bf04bb232 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc @@ -67,6 +67,7 @@ limitations under the License. #include "tensorflow/core/util/tensor_format.h" #include "tensorflow/tsl/platform/bfloat16.h" #include "tensorflow/tsl/platform/status.h" +#include "tensorflow/tsl/platform/tensor_float_32_utils.h" namespace mlir { namespace mhlo { @@ -150,6 +151,21 @@ static IntegerAttr GetHLOAxisFromTFAxis(Attribute attr, int64_t rank, return b->getI64IntegerAttr(axis); } +// Returns a PrecisionConfig as an array attribute based on whether TF32 +// execution is enabled +static ArrayAttr GetPrecisionConfig(Builder *builder) { + mlir::mhlo::Precision precision = tsl::tensor_float_32_execution_enabled() + ? mhlo::Precision::DEFAULT + : mlir::mhlo::Precision::HIGHEST; + llvm::SmallVector attr_vec; + const int num_inputs = 2; + for (int i = 0; i < num_inputs; i++) { + attr_vec.push_back( + mlir::mhlo::PrecisionAttr::get(builder->getContext(), precision)); + } + return builder->getArrayAttr(attr_vec); +} + // If `value` is an IntegerAttr, returns the integer value for the HLO axis // corresponding to the tensorflow axis. 
In particular, the tensorflow axis can // be negative, in which case, the corresponding HLO axis is @@ -1082,6 +1098,9 @@ class ConvertConvDynamic : public OpRewritePattern { auto batch_group_count_attr = rewriter.getNamedAttr( "batch_group_count", rewriter.getI64IntegerAttr(1)); + auto precision_config_attr = rewriter.getNamedAttr( + "precision_config", GetPrecisionConfig(&rewriter)); + Value paddings_op = rewriter.create( op.getLoc(), tensorflow::GetTypeFromTFTensorShape(2 * num_spatial_dims, @@ -1105,9 +1124,9 @@ class ConvertConvDynamic : public OpRewritePattern { filter_ty.getElementType()), operands[1]); } - NamedAttribute attrs[] = {rhs_dilations_attr, window_strides_attr, + NamedAttribute attrs[] = {rhs_dilations_attr, window_strides_attr, dimension_numbers_attr, feature_group_count_attr, - batch_group_count_attr}; + batch_group_count_attr, precision_config_attr}; rewriter.replaceOpWithNewOp(op, op.getType(), operands, llvm::ArrayRef(attrs)); return success(); @@ -1246,6 +1265,9 @@ class ConvertConvOp : public OpRewritePattern { auto paddings_attr = rewriter.getNamedAttr( "padding", DenseElementsAttr::get(paddings_ty, paddings)); + auto precision_config_attr = rewriter.getNamedAttr( + "precision_config", GetPrecisionConfig(&rewriter)); + SmallVector operands(op.getOperands()); // Reshape the filter to {spatial_dims...., 1,in_channels * // channel_multiplier} @@ -1264,7 +1286,8 @@ class ConvertConvOp : public OpRewritePattern { } NamedAttribute attrs[] = {rhs_dilations_attr, window_strides_attr, dimension_numbers_attr, feature_group_count_attr, - batch_group_count_attr, paddings_attr}; + batch_group_count_attr, paddings_attr, + precision_config_attr}; rewriter.replaceOpWithNewOp(op, op.getType(), operands, llvm::ArrayRef(attrs)); return success(); @@ -3160,9 +3183,9 @@ class ConvertBatchMatMulV2Op : public OpRewritePattern { /*rhs_contracting_dimensions=*/rhs_contracting_dimensions); // TODO(silvasean): Emit shape checks for contracting dimensions. 
// (The batch dimensions are checked by the broadcasting logic) - rewriter.replaceOpWithNewOp(op, op.getType(), lhs, rhs, - dimension_numbers, - /*precision_config=*/nullptr); + rewriter.replaceOpWithNewOp( + op, op.getType(), lhs, rhs, dimension_numbers, + /*precision_config=*/GetPrecisionConfig(&rewriter)); return success(); } }; @@ -4958,7 +4981,7 @@ class ConvertConvBackpropInputOp : public OpRewritePattern { /*outputSpatialDimensions=*/spatial_dims), rewriter.getI64IntegerAttr(feature_group_count), /*batch_group_count=*/rewriter.getI64IntegerAttr(1), - /*precision_config=*/ArrayAttr()); + /*precision_config=*/GetPrecisionConfig(&rewriter)); rewriter.replaceOp(op, {result}); @@ -5165,7 +5188,7 @@ class ConvertConvBackpropFilterOp : public OpRewritePattern { /*outputSpatialDimensions=*/output_spatial_dimensions), /*feature_group_count=*/rewriter.getI64IntegerAttr(1), rewriter.getI64IntegerAttr(batch_group_count), - /*precision_config=*/ArrayAttr()); + /*precision_config=*/GetPrecisionConfig(&rewriter)); rewriter.replaceOp(op, {result}); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc index 20eeb67b7d5..4f355d5255c 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc @@ -144,6 +144,11 @@ LogicalResult ConvertAllReduce(OpBuilder& builder, int64_t channel_id, Type element_type = getElementTypeOrSelf(input.getType()); auto all_reduce = builder.create( loc, result_type, input, replica_groups, channel_handle, nullptr); + + if (all_reduce.getNumResults() != 1) { + return op->emitOpError() + << "AllReduceOp must have one result: " << *all_reduce; + } if (merge_op == "Add") { BuildReduceBody(element_type, &all_reduce.getComputation(), &builder); @@ -173,7 +178,7 @@ LogicalResult ConvertAllReduce(OpBuilder& builder, int64_t channel_id, 
GetScalarConstOfType(element_type, loc, replica_group_size, &builder); auto broadcast_dims = GetI64ElementsAttr({}, &builder); result = builder.create( - loc, all_reduce.getResult(), divisor.getResult(), broadcast_dims); + loc, all_reduce.getResult(0), divisor.getResult(), broadcast_dims); } else if (final_op != "Id") { return op->emitOpError() << "invalid final_op " << final_op << ", want one of [Id, Div]"; diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td index f28ea6958d3..3234e22bf6e 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td @@ -389,11 +389,14 @@ foreach src = [TF_PreventGradientOp, TF_CheckNumericsOp] in // MatMul op patterns. //===----------------------------------------------------------------------===// +def GetPrecisionConfig: NativeCodeCall< + "GetPrecisionConfig(&$_builder)">; + def : Pat<(TF_MatMulOp $a, $b, $transpose_a, $transpose_b), (MHLO_DotOp (TF_TransposeOp $a, (TF_ConstOp (Get2DTransposePerm $transpose_a))), (TF_TransposeOp $b, (TF_ConstOp (Get2DTransposePerm $transpose_b))), - /*precision_config=*/(NullArrayAttr))>; + /*precision_config=*/(GetPrecisionConfig))>; //===----------------------------------------------------------------------===// // Lower `tf.ZerosLike` diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc index ddd3b091e23..f5c76c4fecd 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_with_tf2xla.cc @@ -180,6 +180,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), @@ -272,6 +273,7 @@ bool IsOpAllowedTf2XlaFallback(Operation* 
op) { TypeID::get(), TypeID::get(), TypeID::get(), + TypeID::get(), TypeID::get(), TypeID::get(), TypeID::get(), diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc index 4117b5ce026..c916c89fb43 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter.cc @@ -156,6 +156,13 @@ tsl::StatusOr Tf2XlaRewriter::ImportXlaComputation( return tsl::errors::InvalidArgument("Imported XLA Root is not a tuple op"); } + if (op_->getNumOperands() != + hlo_module->entry_computation()->num_parameters()) { + return tsl::errors::InvalidArgument( + "Entry computation does not have equal number of parameters to op " + "operands"); + } + ModuleOp mlir_module = op_->getParentOfType(); mlir::OpBuilder builder(op_); mlir::SymbolTable symbol_table(mlir_module); diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc index 4aeb42bd7bd..b6f1b54591b 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/tf2xla_rewriter_test.cc @@ -75,8 +75,10 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr XlaComputation GetTestXlaComputation() { XlaBuilder xla_builder("test"); - XlaOp add = xla::Add(xla::ConstantR0(&xla_builder, 1.0), - xla::ConstantR0(&xla_builder, 2.0)); + auto param = + Parameter(&xla_builder, 0, ShapeUtil::MakeScalarShape(xla::F32), "a"); + + XlaOp add = xla::Add(param, xla::ConstantR0(&xla_builder, 2.0)); std::vector tuple_values; tuple_values.push_back(add); @@ -291,7 +293,7 @@ TEST_F(Tf2XlaRewriterTest, InsertsConstantParameters) { LegalizeModule(/*use_tf2xla_hlo_importer=*/true, kModuleWithConstParam)); } -TEST_F(Tf2XlaRewriterTest, DISABLED_ImportsPrivateFunctions) { +TEST_F(Tf2XlaRewriterTest, 
ErrorsWithInvalidNumberOfParametersToArgs) { XlaBuilder builder("test_builder"); XlaComputation to_apply; { @@ -315,9 +317,9 @@ TEST_F(Tf2XlaRewriterTest, DISABLED_ImportsPrivateFunctions) { EXPECT_EQ(computation.proto().computations_size(), 2); TF_ASSERT_OK(CreateMlirModule()); - TF_ASSERT_OK_AND_ASSIGN(TupleOp root_tuple, - ImportXlaComputationIntoModule(computation)); - EXPECT_TRUE(root_tuple); + tsl::StatusOr status_or_tuple_op = + ImportXlaComputationIntoModule(computation); + EXPECT_FALSE(status_or_tuple_op.ok()); } } // namespace mhlo diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc index 5fe37a04160..e773f5d8b52 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/verify_tfxla_legalization.cc @@ -48,7 +48,7 @@ class VerifyTFXLALegalization : public impl::VerifyTFXLALegalizationBase { public: explicit VerifyTFXLALegalization(bool legalize_chlo) { - legalize_chlo_ = legalize_chlo_; + legalize_chlo_ = legalize_chlo; } void runOnOperation() override; diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc index 09d5b91f05a..fe5326206a4 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf.cc @@ -730,6 +730,13 @@ const llvm::DenseSet &MlirPreferredOps() { // clang-format off static const llvm::DenseSet* ops = new llvm::DenseSet{ + // Ops that should always use the MLIR legalization. 
+ TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + TypeID::get(), + // Ops that are legalized in the old bridge using MlirXlaOpKernel TypeID::get(), TypeID::get(), diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td index cfec5714798..727baf76084 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/xla_legalize_tf_passes.td @@ -46,7 +46,7 @@ def LegalizeTF : Pass<"xla-legalize-tf", "ModuleOp"> { "Prioritize tf2xla fallback legalization over MLIR legalization " "patterns">, Option<"use_tf2xla_hlo_importer_", "use-tf2xla-hlo-importer", - "bool", /*default=*/"false", + "bool", /*default=*/"true", "Use the experimental HLO to MHLO importer for per-op fallback calls " " from MLIR bridge to TF2XLA." "Users should not set this flag and ideally this goes away."> diff --git a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc index 6758aee3b77..00ae360f1e6 100644 --- a/tensorflow/compiler/mlir/tf_mlir_translate_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_translate_main.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include +#include +#include #include "absl/strings/str_split.h" #include "llvm/Support/InitLLVM.h" diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD index f9fff19986e..b1990be9b58 100644 --- a/tensorflow/compiler/mlir/tfr/BUILD +++ b/tensorflow/compiler/mlir/tfr/BUILD @@ -194,6 +194,7 @@ tf_cc_binary( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir:run_lit.sh", test_file_exts = ["mlir"], @@ -328,8 +329,8 @@ tf_python_pybind_extension( deps = [ "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tfr", - "//tensorflow/python:pybind11_lib", - "//tensorflow/python:pybind11_status", + "//tensorflow/python/lib/core:pybind11_lib", + "//tensorflow/python/lib/core:pybind11_status", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:FuncDialect", @@ -387,8 +388,8 @@ tf_py_test( ":tfr_gen", "//tensorflow/compiler/mlir/python/mlir_wrapper:filecheck_wrapper", "//tensorflow/compiler/mlir/tfr/resources:test_ops", - "//tensorflow/python:array_ops", - "//tensorflow/python:math_ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", "//tensorflow/python/platform:client_testlib", ], ) diff --git a/tensorflow/compiler/mlir/tfr/examples/customization/BUILD b/tensorflow/compiler/mlir/tfr/examples/customization/BUILD index 748a189e25c..fe4b0ebee47 100644 --- a/tensorflow/compiler/mlir/tfr/examples/customization/BUILD +++ b/tensorflow/compiler/mlir/tfr/examples/customization/BUILD @@ -39,6 +39,6 @@ tf_py_test( deps = [ "//tensorflow:tensorflow_py", "//tensorflow/compiler/mlir/tfr:test_utils", - "//tensorflow/python:test_ops", + "//tensorflow/python/framework:test_ops", ], ) diff --git a/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD b/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD index 4160d864f2e..b54b5fc56fb 100644 --- 
a/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD +++ b/tensorflow/compiler/mlir/tfr/examples/mnist/BUILD @@ -54,7 +54,7 @@ py_library( ":mnist_ops", ":mnist_ops_py", "//tensorflow:tensorflow_py", - "//tensorflow/python:framework", + "//tensorflow/python/framework", "@absl_py//absl/flags", ], ) @@ -80,12 +80,12 @@ distribute_py_test( xla_enable_strict_auto_jit = False, deps = [ ":mnist_train", - "//tensorflow/python:client_testlib", "//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:is_mlir_bridge_test_true", "//tensorflow/python/distribute:combinations", "//tensorflow/python/distribute:strategy_combinations", "//tensorflow/python/distribute:test_util", + "//tensorflow/python/framework:is_mlir_bridge_test_true", + "//tensorflow/python/platform:client_testlib", "@absl_py//absl/testing:parameterized", ], ) diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc index 91a306c1fba..d30b5934691 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.cc @@ -161,6 +161,16 @@ bool TFRType::classof(Type type) { // Custom op methods //===----------------------------------------------------------------------===// +void CallOp::setCalleeFromCallable(CallInterfaceCallable callee) { + // Direct call. + if (FlatSymbolRefAttr calleeAttr = getCalleeAttr()) { + auto symRef = callee.get(); + return setCalleeAttr(cast(symRef)); + } + // Indirect call, callee Value is the first operand. 
+ return setOperand(0, callee.get()); +} + LogicalResult ConstantTensorOp::verify() { ConstantTensorOp op = *this; auto input_type = op.getArg().getType(); diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td index f9ce81f680b..3746674a8ce 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_ops.td @@ -143,6 +143,8 @@ def TFR_CallOp : TFR_Op<"call", [CallOpInterface]> { // Return the callee of this operation. CallInterfaceCallable getCallableForCallee() { return getCalleeAttr(); } + // Sets the callee from the callable + void setCalleeFromCallable(CallInterfaceCallable callee); }]; let assemblyFormat = [{ diff --git a/tensorflow/compiler/mlir/tfr/ir/tfr_types.h b/tensorflow/compiler/mlir/tfr/ir/tfr_types.h index d1049e51dd9..c862f0f1b5f 100644 --- a/tensorflow/compiler/mlir/tfr/ir/tfr_types.h +++ b/tensorflow/compiler/mlir/tfr/ir/tfr_types.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_TYPES_H_ #define TENSORFLOW_COMPILER_MLIR_TFR_IR_TFR_TYPES_H_ +#include +#include + #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfr/passes/decompose.cc b/tensorflow/compiler/mlir/tfr/passes/decompose.cc index 9a76d68efd9..5d59d958d3e 100644 --- a/tensorflow/compiler/mlir/tfr/passes/decompose.cc +++ b/tensorflow/compiler/mlir/tfr/passes/decompose.cc @@ -14,8 +14,11 @@ limitations under the License. 
==============================================================================*/ #include +#include #include #include +#include +#include #include #include #include diff --git a/tensorflow/compiler/mlir/tfr/passes/passes.h b/tensorflow/compiler/mlir/tfr/passes/passes.h index 967a4c35d99..00bf11870ca 100644 --- a/tensorflow/compiler/mlir/tfr/passes/passes.h +++ b/tensorflow/compiler/mlir/tfr/passes/passes.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFR_PASSES_PASSES_H_ #define TENSORFLOW_COMPILER_MLIR_TFR_PASSES_PASSES_H_ +#include #include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc index 2b69ba782a8..dd85565cfed 100644 --- a/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc +++ b/tensorflow/compiler/mlir/tfr/passes/raise_to_tf.cc @@ -15,6 +15,7 @@ limitations under the License. #include #include +#include #include #include #include diff --git a/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc b/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc index c1f34402835..babfef28d33 100644 --- a/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc +++ b/tensorflow/compiler/mlir/tfr/passes/rewrite_quantized_io.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include "llvm/ADT/StringRef.h" diff --git a/tensorflow/compiler/mlir/tfr/python/test_utils.py b/tensorflow/compiler/mlir/tfr/python/test_utils.py index 22c61d0a5c8..09c1455eae0 100644 --- a/tensorflow/compiler/mlir/tfr/python/test_utils.py +++ b/tensorflow/compiler/mlir/tfr/python/test_utils.py @@ -13,6 +13,7 @@ # limitations under the License. 
"""Test utils for composite op definition.""" from tensorflow.python.eager import backprop +from tensorflow.python.framework import test_util from tensorflow.python.platform import test @@ -23,6 +24,8 @@ class OpsDefsTest(test.TestCase): op_kwargs=None): if op_kwargs is None: op_kwargs = kwargs + if test_util.IsMklEnabled(): + self.skipTest("Not compatible with oneDNN custom ops.") # compute with op. with backprop.GradientTape() as gt: diff --git a/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc b/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc index 7e580ba61e4..760ddab974c 100644 --- a/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc +++ b/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfr/utils/utils.cc b/tensorflow/compiler/mlir/tfr/utils/utils.cc index 4f7a90bb972..3580b7dab7f 100644 --- a/tensorflow/compiler/mlir/tfr/utils/utils.cc +++ b/tensorflow/compiler/mlir/tfr/utils/utils.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfr/utils/utils.h" +#include + #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" diff --git a/tensorflow/compiler/mlir/tfr/utils/utils.h b/tensorflow/compiler/mlir/tfr/utils/utils.h index 7e0c0208254..911015ae0be 100644 --- a/tensorflow/compiler/mlir/tfr/utils/utils.h +++ b/tensorflow/compiler/mlir/tfr/utils/utils.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TFR_UTILS_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_TFR_UTILS_UTILS_H_ +#include + #include "mlir/IR/Block.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index 068b7cabf22..55ca19518cb 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -22,9 +22,9 @@ package_group( packages = [ "//tensorflow/compiler/...", "//tensorflow/core/runtime_fallback/...", - "//tensorflow/core/tfrt/eager/...", "//tensorflow/core/tfrt/experimental/data/...", "//tensorflow/core/tfrt/graph_executor/...", + "//tensorflow/core/tfrt/mlrt/...", "//tensorflow/core/tfrt/saved_model/...", "//tensorflow/core/tfrt/tfrt_session/...", ] + if_google([ @@ -307,8 +307,10 @@ cc_library( "@llvm-project//llvm:Support", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", + "@tf_runtime//:basic_kernels_alwayslink", "@tf_runtime//:bef", "@tf_runtime//:befexecutor", + "@tf_runtime//:core_runtime_alwayslink", "@tf_runtime//:hostcontext", "@tf_runtime//:mlirtobef", "@tf_runtime//:support", @@ -425,13 +427,12 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_asset_sinking_pass", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_opdefs", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_gpu_opdefs", "//tensorflow/core:framework", "//tensorflow/core/platform:status", - "//tensorflow/core/platform:tstring", - "//tensorflow/tsl/platform:status", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", @@ -531,10 +532,9 @@ cc_library( "translate/import_model.h", ], visibility = [ - # 
copybara:uncomment "//learning/brain/experimental/tfrt/mlrt/application/tensorflow/compiler/transforms:__pkg__", # copybara:uncomment "//learning/brain/experimental/tfrt/visualization:__pkg__", "//tensorflow/compiler/mlir/tfrt/tests/saved_model:__pkg__", - "//tensorflow/core/tfrt/eager:__pkg__", + "//tensorflow/compiler/mlir/tfrt/transforms/mlrt:__pkg__", "//tensorflow/core/tfrt/graph_executor:__pkg__", "//tensorflow/core/tfrt/saved_model:__pkg__", ], @@ -587,6 +587,7 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@tf_runtime//:compiler_tfrt_op_interfaces", ], ) @@ -660,29 +661,35 @@ cc_library( deps = [ ":passes", ":test_cost_analysis_pass", + ":test_opkernels", ":test_tensor_array_side_effect_analysis", ":tf_jitrt_opdefs", ":tf_to_tfrt", ":tfrt_jitrt_passes", + ":transforms/gpu_passes", "//tensorflow/compiler/mlir:init_mlir", "//tensorflow/compiler/mlir:passes", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:bridge_pass_test_pipeline_registration", "//tensorflow/compiler/mlir/tensorflow:tensorflow_passes", - "//tensorflow/compiler/mlir/tfrt:transforms/gpu_passes", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_async_opdefs", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_opdefs", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_fallback_sync_opdefs", "//tensorflow/compiler/mlir/tfrt/ir:tfrt_gpu_opdefs", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops", "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_passes", "//tensorflow/compiler/mlir/tfrt/jit/transforms:tf_jitrt_test_passes", + "//tensorflow/compiler/mlir/tfrt/transforms/mlrt:passes", "//tensorflow/compiler/xla/mlir_hlo:gml_st", "//tensorflow/compiler/xla/mlir_hlo:gml_st_passes", "//tensorflow/core:lib", + "//tensorflow/core:tensorflow", "@llvm-project//mlir:AllPassesAndDialects", 
"@llvm-project//mlir:MlirOptLib", "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Transforms", "@tf_runtime//:init_tfrt_dialects", "@tf_runtime//:print_stream_pass", "@tf_runtime//backends/jitrt:jitrt_compiler", @@ -892,3 +899,11 @@ cc_library( name = "constants", hdrs = ["constants.h"], ) + +cc_library( + name = "test_opkernels", + testonly = True, + srcs = ["test_opkernels.cc"], + deps = ["//tensorflow/core:framework"], + alwayslink = True, +) diff --git a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc index c7d02332839..5573e7c2d46 100644 --- a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc +++ b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.cc @@ -14,12 +14,15 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h" +#include #include +#include #include "absl/container/flat_hash_map.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tfrt/constants.h" #include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tfrt/compiler/opdefs/tfrt_op_interfaces.h" // from @tf_runtime namespace tensorflow { namespace tfrt_compiler { @@ -157,6 +160,12 @@ void CostAnalysis::AnalyzeBlock(mlir::Block* block) { } void CostAnalysis::EvaluateCost(mlir::Operation* op) { + if (auto cost_function = + mlir::dyn_cast(op)) { + cost_map_[op] = cost_function.cost(); + return; + } + if (!llvm::isa(op->getDialect())) { cost_map_[op] = max_arg_size_; return; @@ -178,7 +187,7 @@ void CostAnalysis::EvaluateCost(mlir::Operation* op) { const auto op_key_attr = op->getAttrOfType(kOpKeyAttrName); if (op_key_attr) { - cost_map_[op] = cost_recorder_->GetCostNanosecond(op_key_attr.getInt()); + cost_map_[op] = cost_recorder_->GetCost(op_key_attr.getInt()); return; } } diff --git a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h 
b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h index fa01b38dd64..809846619d3 100644 --- a/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h +++ b/tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COST_ANALYSIS_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_ANALYSIS_COST_ANALYSIS_H_ +#include + #include "absl/strings/string_view.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.cc index 58ff8929c2a..cf3acc48906 100644 --- a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.cc +++ b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h" +#include #include +#include #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/ExecutionEngine/CRunnerUtils.h" diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h index c315d2e9917..41f3b93b121 100644 --- a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h +++ b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_BENCHMARK_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_BENCHMARK_H_ +#include +#include #define EIGEN_USE_THREADS #include diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.cc index a390d365303..2fc595caee0 100644 --- a/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.cc +++ b/tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.cc @@ -55,7 +55,7 @@ static llvm::SmallVector GetInputTensors( for (const InputTensorSpec& spec : input_specs) { TensorShape shape; - CHECK(TensorShapeUtils::MakeShape(spec.dims, &shape).ok()); + CHECK_OK(TensorShapeUtils::MakeShape(spec.dims, &shape)); input_tensors.emplace_back(spec.dtype, shape); // Initialize tensors with random data. diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/cwise_op_unary_benchmark.h b/tensorflow/compiler/mlir/tfrt/benchmarks/cwise_op_unary_benchmark.h index d3f7ade5a32..5d8972ec0e2 100644 --- a/tensorflow/compiler/mlir/tfrt/benchmarks/cwise_op_unary_benchmark.h +++ b/tensorflow/compiler/mlir/tfrt/benchmarks/cwise_op_unary_benchmark.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_CWISE_OP_UNARY_BENCHMARK_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_BENCHMARKS_CWISE_OP_UNARY_BENCHMARK_H_ +#include +#include #include #include diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.cc b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.cc index 5e8f9115360..c578b82c17a 100644 --- a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.cc +++ b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.cc @@ -14,6 +14,9 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h" + +#include + #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h" namespace tensorflow { diff --git a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h index 22977dfd702..da619834397 100644 --- a/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h +++ b/tensorflow/compiler/mlir/tfrt/benchmarks/reduction_benchmark.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark.h" #include "tensorflow/compiler/mlir/tfrt/benchmarks/benchmark_mlir_function.h" diff --git a/tensorflow/compiler/mlir/tfrt/constants.h b/tensorflow/compiler/mlir/tfrt/constants.h index dfbb9ba4898..ed6e773c52a 100644 --- a/tensorflow/compiler/mlir/tfrt/constants.h +++ b/tensorflow/compiler/mlir/tfrt/constants.h @@ -23,12 +23,6 @@ namespace tfrt_compiler { inline constexpr char kOpKeyAttrName[] = "__op_key"; } // namespace tfrt_compiler - -namespace mlrt_compiler { - -inline constexpr char kArgPassByValue[] = "mlrt.__pass_by_value"; - -} // namespace mlrt_compiler } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_TFRT_CONSTANTS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/ir/BUILD b/tensorflow/compiler/mlir/tfrt/ir/BUILD index f63f0c7ff07..464bef0fe8d 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/BUILD +++ b/tensorflow/compiler/mlir/tfrt/ir/BUILD @@ -62,6 +62,7 @@ cc_library( visibility = [ "//tensorflow/compiler/mlir/tfrt:__subpackages__", # copybara:uncomment "//tensorflow/core/runtime_fallback:internal", + "//tensorflow/core/tfrt/mlrt/application/tensorflow/tests:__subpackages__", ], deps = [ ":tfrt_fallback_common", diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD new file mode 100644 index 
00000000000..313b7ee1197 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/BUILD @@ -0,0 +1,190 @@ +load("@llvm-project//mlir:tblgen.bzl", "gentbl_cc_library", "td_library") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + licenses = ["notice"], +) + +td_library( + name = "mlrt_td_files", + srcs = [ + "mlrt_dialect.td", + "mlrt_ops.td", + ], + includes = ["."], + visibility = [ + "//tensorflow/core/tfrt/mlrt:__subpackages__", + ], + deps = [ + "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:SideEffectInterfacesTdFiles", + ], +) + +gentbl_cc_library( + name = "mlrt_ops_inc_gen", + tbl_outs = [ + ( + ["-gen-op-decls"], + "mlrt_ops.h.inc", + ), + ( + ["-gen-op-defs"], + "mlrt_ops.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "mlrt_ops.td", + deps = [":mlrt_td_files"], +) + +cc_library( + name = "mlrt_ops", + srcs = [ + "mlrt_dialect.cc", + "mlrt_ops.cc", + ], + hdrs = [ + "mlrt_dialect.h", + "mlrt_ops.h", + ], + visibility = [ + "//tensorflow/compiler/mlir/tfrt:__subpackages__", + "//tensorflow/core/tfrt/mlrt:__subpackages__", + ], + deps = [ + ":mlrt_ops_inc_gen", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + ], +) + +td_library( + name = "tf_mlrt_td_files", + srcs = [ + "tf_mlrt_dialect.td", + "tf_mlrt_ops.td", + "tf_ops.td", + ], + includes = ["."], + visibility = [ + # copybara:uncomment "//learning/brain/experimental/tfrt:__subpackages__", + # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", + ], + deps = [ + ":mlrt_td_files", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", + "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:SideEffectInterfacesTdFiles", + "@tf_runtime//:compiler_td_files", + ], +) + +td_library( + name = "tf_mlrt_tpu_td_files", + srcs = [ + "tf_mlrt_tpu_ops.td", + ], + includes = ["."], + visibility = [ + "//tensorflow/core/tfrt/mlrt:__subpackages__", + ], + deps = [ + 
":mlrt_td_files", + ":tf_mlrt_td_files", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_td_files", + "@llvm-project//mlir:OpBaseTdFiles", + "@llvm-project//mlir:SideEffectInterfacesTdFiles", + ], +) + +gentbl_cc_library( + name = "tf_mlrt_ops_inc_gen", + tbl_outs = [ + ( + ["-gen-op-decls"], + "tf_mlrt_ops.h.inc", + ), + ( + ["-gen-op-defs"], + "tf_mlrt_ops.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "tf_mlrt_ops.td", + deps = [":tf_mlrt_td_files"], +) + +gentbl_cc_library( + name = "tf_mlrt_tpu_ops_inc_gen", + tbl_outs = [ + ( + ["-gen-op-decls"], + "tf_mlrt_tpu_ops.h.inc", + ), + ( + ["-gen-op-defs"], + "tf_mlrt_tpu_ops.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "tf_mlrt_tpu_ops.td", + deps = [":tf_mlrt_tpu_td_files"], +) + +gentbl_cc_library( + name = "tf_ops_inc_gen", + tbl_outs = [ + ( + ["-gen-op-decls"], + "tf_ops.h.inc", + ), + ( + ["-gen-op-defs"], + "tf_ops.cpp.inc", + ), + ], + tblgen = "@llvm-project//mlir:mlir-tblgen", + td_file = "tf_ops.td", + deps = [":tf_mlrt_td_files"], +) + +cc_library( + name = "tf_mlrt_ops", + srcs = ["tf_mlrt_ops.cc"], + hdrs = ["tf_mlrt_ops.h"], + visibility = [ + # copybara:uncomment "//learning/brain/experimental/tfrt/mlrt/application/tensorflow/tests:__subpackages__", + # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", + "//tensorflow/compiler/mlir/tfrt:__subpackages__", + "//tensorflow/core/tfrt/mlrt:__subpackages__", + ], + deps = [ + ":mlrt_ops", + ":tf_mlrt_ops_inc_gen", + ":tf_ops_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_side_effects", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Transforms", + "@tf_runtime//:compiler_tfrt_op_interfaces", + "@tf_runtime//:compiler_tfrt_traits", + ], +) + +cc_library( + name = "tf_mlrt_tpu_ops", + srcs = ["tf_mlrt_tpu_ops.cc"], + hdrs = ["tf_mlrt_tpu_ops.h"], + visibility = [ + 
"//tensorflow/compiler/mlir/tfrt/transforms/mlrt:__subpackages__", + "//tensorflow/core/tfrt/mlrt:__subpackages__", + ], + deps = [ + ":mlrt_ops", + ":tf_mlrt_ops", + ":tf_mlrt_tpu_ops_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "@llvm-project//mlir:IR", + ], +) diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.cc b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.cc new file mode 100644 index 00000000000..50d4cb12142 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.cc @@ -0,0 +1,95 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h" + +namespace mlrt { +namespace compiler { + +namespace { + +struct MlrtInlinerInterface : public mlir::DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + bool isLegalToInline(mlir::Operation *op, mlir::Region *dest, + bool would_be_cloned, + mlir::IRMapping &mapping) const final { + // All mlrt dialect ops can be inlined. + return true; + } +}; + +} // namespace + +MlrtDialect::MlrtDialect(mlir::MLIRContext *context) + : mlir::Dialect(/*name=*/"mlrt", context, + mlir::TypeID::get()) { + addTypes(); + addTypes(); + addTypes(); + addInterfaces(); + + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.cpp.inc" + >(); +} + +// Parse a type registered to this dialect. 
+mlir::Type MlrtDialect::parseType(mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return mlir::Type(); + + if (keyword == "future") return FutureType::get(getContext()); + if (keyword == "promise") return PromiseType::get(getContext()); + if (keyword == "async_handle") return AsyncHandleType::get(getContext()); + + parser.emitError(parser.getNameLoc(), "unknown type: ") << keyword; + return mlir::Type(); +} + +// Print a type registered to this dialect. +void MlrtDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &os) const { + if (type.isa()) { + os << "future"; + return; + } + + if (type.isa()) { + os << "promise"; + return; + } + + if (type.isa()) { + os << "async_handle"; + return; + } + + llvm_unreachable("unexpected mlrt type kind"); +} + +} // namespace compiler +} // namespace mlrt diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h new file mode 100644 index 00000000000..0fb568b44dc --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h @@ -0,0 +1,59 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_DIALECT_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_DIALECT_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project + +namespace mlrt { +namespace compiler { + +class MlrtDialect : public mlir::Dialect { + public: + explicit MlrtDialect(mlir::MLIRContext *context); + static llvm::StringRef getDialectNamespace() { return "mlrt"; } + + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + void printType(mlir::Type type, mlir::DialectAsmPrinter &os) const override; +}; + +// The MLIR type represents a C++ mlrt::Future. +class FutureType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +// The MLIR type represents a C++ mlrt::Promise. +class PromiseType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +// The MLIR type represents a C++ mlrt::AsyncHandle. +class AsyncHandleType : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +} // namespace compiler +} // namespace mlrt + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_DIALECT_H_ diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td new file mode 100644 index 00000000000..b260dcb402f --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td @@ -0,0 +1,55 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifdef MLRT_DIALECT +#else +#define MLRT_DIALECT + +include "mlir/IR/OpBase.td" + +def Mlrt_Dialect : Dialect { + let name = "mlrt"; + + let description = [{ + The MLRT Dialect. + }]; + + let cppNamespace = "::mlrt::compiler"; +} + +def MlrtFutureType : DialectType()">, "!mlrt.future type">, + BuildableType<"$_builder.getType<::mlrt::compiler::FutureType>()"> { + let description = [{ + `!mlrt.future type` represents a C++ mlrt::Future. + }]; +} + +def MlrtPromiseType : DialectType()">, "!mlrt.promise type">, + BuildableType<"$_builder.getType<::mlrt::compiler::PromiseType>()"> { + let description = [{ + `!mlrt.promise type` represents a C++ mlrt::Promise. + }]; +} + +def MlrtAsyncHandleType : DialectType()">, "!mlrt.async_handle type">, + BuildableType<"$_builder.getType<::mlrt::compiler::AsyncHandleType>()"> { + let description = [{ + `!mlrt.async_handle type` represents a C++ mlrt::AsyncHandle. + }]; +} + +#endif diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.cc b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.cc new file mode 100644 index 00000000000..878b2504de2 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.cc @@ -0,0 +1,28 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h" + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.cpp.inc" diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h new file mode 100644 index 00000000000..e3922c6e0ce --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h @@ -0,0 +1,27 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_OPS_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_MLRT_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.td new file mode 100644 index 00000000000..24c34fb4a41 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.td @@ -0,0 +1,240 @@ +/* Copyright 2019 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifdef MLRT_OPS +#else +#define MLRT_OPS + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td" + +class Mlrt_Op traits = []> : + Op { +} + +def CondOp: Mlrt_Op<"cond", []> { + let summary = "mlrt.cond op"; + + let description = [{ + Execute $a_true_fn with $args if $cond is true; otherwise, %b_false_fn is + executed. 
+ }]; + + let arguments = (ins + I1:$cond, + Variadic:$args, + SymbolRefAttr:$a_true_fn, + SymbolRefAttr:$b_false_fn + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = [{ + $cond $a_true_fn $b_false_fn `(` $args `)` attr-dict `:` `(` type($args) `)` `->` `(` type($results) `)` + }]; +} + +def AsyncOp: Mlrt_Op<"async", []> { + let summary = "Launches a function asynchronously."; + + let description = [{ + Launch a function asynchronously. + + $args: a list of arguments to be passed. + $callee: The function to be launched. Its return op must not have operands. + + $handle: This op returns a handle object that manages the context of the async execution. + }]; + + let arguments = (ins + Variadic:$args, + SymbolRefAttr:$callee + ); + + let results = (outs + MlrtAsyncHandleType:$handle + ); + + let assemblyFormat = "`(` $args `)` attr-dict `:` functional-type($args, $handle)"; +} + +def AwaitHandleOp: Mlrt_Op<"await_handle", []> { + let summary = "Awaits an async execution "; + + let description = [{ + Awaits an async execution. + + $handle: The handle returned by mlrt.async op. + }]; + + let arguments = (ins + MlrtAsyncHandleType:$handle + ); + + let assemblyFormat = "operands attr-dict"; +} + +def AwaitAllHandleOp: Mlrt_Op<"await_all_handle", []> { + let summary = "Awaits multiple async executions"; + + let description = [{ + Awaits multiple async execution. + + $handles: A list of handles returned by mlrt.async ops. + }]; + + let arguments = (ins + Variadic:$handles + ); + + let assemblyFormat = "operands attr-dict `:` type($handles)"; +} + +def AwaitControlOp: Mlrt_Op<"await_control", []> { + let summary = "Await a signal from a future"; + + let description = [{ + Await a signal, instead of a value, from a future. + + $future: A value of !mlrt.future type. 
+ }]; + + let arguments = (ins + MlrtFutureType:$future + ); + + let assemblyFormat = "operands attr-dict"; +} + +def AwaitAllControlOp: Mlrt_Op<"await_all_control", []> { + let summary = "Awaits multiple signals"; + + let description = [{ + Awaits multiple signals + + $futures: A list of !mlrt.futures + }]; + + let arguments = (ins + Variadic:$futures + ); + + let assemblyFormat = "operands attr-dict `:` type($futures)"; +} + +def PromiseControlOp: Mlrt_Op<"promise_control", []> { + let summary = "Set a control promise"; + + let description = [{ + Set a control promise. + + $promise: A value of !mlrt.promise type. + }]; + + let arguments = (ins + MlrtPromiseType:$promise + ); + + let assemblyFormat = "operands attr-dict"; +} + +def CaseOp : Mlrt_Op<"case"> { + let summary = "An n-way switch statement which calls a single branch function."; + let description = [{ + An n-way switch statement, implementing the following: + ``` + switch (branch_index) { + case 0: + outputs = branches[0](inputs); + break; + case 1: + outputs = branches[1](inputs); + break; + ... + case [[nbranches-1]]: + default: + outputs = branches[nbranches-1](inputs); + break; + } + ``` + Example: %res = mlrt.case %branch_idx [@branch0, @branch1] (%arg0, %arg1) : (i32, i32) -> (i32) + }]; + + let arguments = (ins I32:$branch_index, + ConfinedAttr]>:$branches, + Variadic:$branch_operands); + + let results = (outs Variadic:$branch_outputs); + let assemblyFormat = [{ + $branch_index $branches `(` $branch_operands `)` attr-dict `:` `(` type($branch_operands) `)` `->` `(` type($branch_outputs) `)` + }]; +} + +def AllocateControlFuturesOp: Mlrt_Op<"allocate_control_futures", [AttrSizedResultSegments]> { + let summary = "Allocate futures and corresponding promises"; + + let description = [{ + Allocate futures and corresponding promises. + + $num: The number of futures to be allocated. + + $promises: There are $num promises, and promises[i] shares the state with futures[i]. 
+ $futures: There are $num futures, and futures[i] shares the state with promises[i]. + }]; + + let arguments = (ins + I32Attr:$num + ); + + let results = (outs + Variadic:$promises, + Variadic:$futures + ); +} + +def WhileOp : Mlrt_Op<"while", []> { + let summary = "while operation"; + let description = [{ + cond: The boolean to control whether the first iteration should be + executed. + operands: The arguments to the first iteration. + results: The results of the last iteration. The number and types of results + excluding the last one are the same as the number and types of operands. The + last element of results is an I1 value that is false. + body_fn: The body function that takes the arguments and returns the results + that includes an I1 value to indicate whether next iteration should be executed. + + The pseudo code: + + while(cond) { + results = body_fn(operands) + cond = results#1 + } + return results + + }]; + + let arguments = (ins I1:$cond, + Variadic:$arguments, + FlatSymbolRefAttr:$body_fn); + + let results = (outs Variadic); + + let assemblyFormat = [{ + $cond $body_fn `(` $arguments `)` attr-dict `:` `(` type($arguments) `)` `->` `(` type(results) `)` + }]; +} +#endif diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td new file mode 100644 index 00000000000..9cf997e0c3e --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td @@ -0,0 +1,56 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifdef TF_MLRT_DIALECT +#else +#define TF_MLRT_DIALECT + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td" + +// TODO(chky,rohitju): Unify this dialect with tfrt_fallback_sync dialect after +// vrooml is using the new interpreter. +def TensorflowMlrt_Dialect : Dialect { + let name = "tf_mlrt"; + + let description = [{ + The TF MLRT Dialect. + }]; + + let cppNamespace = "::tensorflow::tf_mlrt"; +} + +class TensorflowMlrt_Op traits = []> : + Op { +} + +// This corresponds to tensorflow::Tensor. +def TFTensorType : DialectType()">, "!tf_mlrt.tensor type">, + BuildableType<"$_builder.getType<::tensorflow::tf_mlrt::TFTensorType>()"> { + let description = [{ + `!tf_mlrt.tensor type` represents a tensorflow::Tensor. + }]; +} + +// This corresponds to tensorflow::Device* . +def TFDeviceType : DialectType()">, "!tf_mlrt.device type">, + BuildableType<"$_builder.getType<::tensorflow::tf_mlrt::TFDeviceType>()"> { + let description = [{ + `!tf_mlrt.device type` represents a tensorflow::device. + }]; +} + +#endif diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cc b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cc new file mode 100644 index 00000000000..fc4cb6a93a2 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cc @@ -0,0 +1,95 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Transforms/InliningUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" + +namespace tensorflow { +namespace tf_mlrt { + +namespace { + +struct TensorflowMlrtInlinerInterface : public mlir::DialectInlinerInterface { + using DialectInlinerInterface::DialectInlinerInterface; + bool isLegalToInline(mlir::Operation *op, mlir::Region *dest, + bool would_be_cloned, + mlir::IRMapping &mapping) const final { + // All tf_mlrt dialect ops can be inlined. + return true; + } + // Note that CallOp and ReturnOp are handled by func; so need to implement + // handleTerminator. 
+}; + +} // namespace + +TensorflowMlrtDialect::TensorflowMlrtDialect(mlir::MLIRContext *context) + : mlir::Dialect(/*name=*/"tf_mlrt", context, + mlir::TypeID::get()) { + addTypes(); + addInterfaces(); + + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cpp.inc" + >(); + + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.cpp.inc" + >(); +} + +// Parse a type registered to this dialect. +mlir::Type TensorflowMlrtDialect::parseType( + mlir::DialectAsmParser &parser) const { + llvm::StringRef keyword; + if (parser.parseKeyword(&keyword)) return mlir::Type(); + + if (keyword == "tensor") return TFTensorType::get(getContext()); + + parser.emitError(parser.getNameLoc(), "unknown type: ") << keyword; + return mlir::Type(); +} + +// Print a type registered to this dialect. +void TensorflowMlrtDialect::printType(mlir::Type type, + mlir::DialectAsmPrinter &os) const { + if (type.isa()) { + os << "tensor"; + return; + } + + llvm_unreachable("unexpected tf_mlrt type kind"); +} + +} // namespace tf_mlrt +} // namespace tensorflow + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.cpp.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.cpp.inc" diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h new file mode 100644 index 00000000000..da91450aa19 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h @@ -0,0 +1,61 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_OPS_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_side_effects.h" +#include "tfrt/compiler/opdefs/tfrt_op_interfaces.h" // from @tf_runtime +#include "tfrt/compiler/opdefs/tfrt_traits.h" // from @tf_runtime + +namespace tensorflow { +namespace tf_mlrt { + +class TensorflowMlrtDialect : public mlir::Dialect { + public: + explicit TensorflowMlrtDialect(mlir::MLIRContext *context); + static llvm::StringRef getDialectNamespace() { return "tf_mlrt"; } + + mlir::Type parseType(mlir::DialectAsmParser &parser) const override; + void printType(mlir::Type type, mlir::DialectAsmPrinter &os) const override; +}; + +// The MLIR type represents a tensorflow::Tensor. 
+class TFTensorType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +// The MLIR type represents a tensorflow::Device* +class TFDeviceType + : public mlir::Type::TypeBase { + public: + using Base::Base; +}; + +} // namespace tf_mlrt +} // namespace tensorflow + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h.inc" +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td new file mode 100644 index 00000000000..bbbec10187a --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td @@ -0,0 +1,378 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifdef TF_MLRT_OPS +#else +#define TF_MLRT_OPS + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td" +include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td" + +def CreateOp: TensorflowMlrt_Op<"createop", []> { + let summary = "The Fallback CreateOp"; + + let description = [{ + The CreateOp creates the tensorflow::OpKernel in the fallback context. 
+ }]; + + let arguments = (ins + StrAttr:$node_def, + I32Attr:$op_key + ); + + let assemblyFormat = "attr-dict"; +} + +def ExecuteOp : TensorflowMlrt_Op<"executeop", []> { + let summary = "The Fallback ExecuteOp"; + let description = [{ + The ExecuteOp executes an operation on the specified device. + }]; + + let arguments = (ins + Variadic:$args, + StrAttr:$node_def, + I32Attr:$op_key + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "`(` $args `)` attr-dict `:` functional-type($args, $results)"; +} + +def ExecuteOpWithDevice: TensorflowMlrt_Op<"executeop.device", []> { + let summary = "The Fallback ExecuteOp with custom device"; + let description = [{ + The ExecuteOp executes an operation on the specified device using a custom device. + }]; + + let arguments = (ins + TFDeviceType:$device, + Variadic:$args, + StrAttr:$node_def, + I32Attr:$op_key + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "`(` $device`)` `(` $args `)` attr-dict `:` functional-type($args, $results)"; +} + +def AsyncExecuteOp : TensorflowMlrt_Op<"async_executeop", []> { + let summary = "The Fallback ExecuteOp for tensorflow::AsyncOpKernel"; + let description = [{ + The ExecuteOp executes an operation on the specified device asynchronously. + }]; + + let arguments = (ins + Variadic:$args, + StrAttr:$node_def, + I32Attr:$op_key + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "`(` $args `)` attr-dict `:` functional-type($args, $results)"; +} + +def AsyncExecuteOpWithDevice : TensorflowMlrt_Op<"async_executeop.device", []> { + let summary = "The Fallback ExecuteOp for tensorflow::AsyncOpKernel"; + let description = [{ + The ExecuteOp executes an operation on the specified device asynchronously. 
+ }]; + + let arguments = (ins + TFDeviceType:$device, + Variadic:$args, + StrAttr:$node_def, + I32Attr:$op_key + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "`(` $device`)` `(` $args `)` attr-dict `:` functional-type($args, $results)"; +} + +def SetResourceOp : TensorflowMlrt_Op<"set_resource", []> { + let summary = "Set a tensor in resource array"; + + let description = [{ + Set a tensor in resource array. + + arg: the tensor to be set in the resource array. + index: the index in the resource array + }]; + + let arguments = (ins + TFTensorType:$arg, + I64Attr:$index + ); + + let results = (outs); + + let assemblyFormat = "operands attr-dict"; +} + +def GetResourceOp : TensorflowMlrt_Op<"get_resource", []> { + let summary = "get a tensor in resource array"; + + let description = [{ + Get a tensor in resource array. + + indices: the indices in the resource array. + results: the tensor values for the corresponding indices. + }]; + + let arguments = (ins + I64ArrayAttr:$indices + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "attr-dict `:` type($results)"; +} + +def AwaitOp: TensorflowMlrt_Op<"await", [Pure]> { + let summary = "Await a tensor from a !mlrt.future"; + + let description = [{ + Await a tensor from a !mlrt.future. + + $future: A value of type !mlrt.future. The underlying value must be a tensorflow tensor. + + $result: a tensorflow tensor. + }]; + + let arguments = (ins + MlrtFutureType:$future + ); + + let results = (outs + TFTensorType:$result + ); + + let assemblyFormat = "operands attr-dict"; +} + +def AwaitAllOp: TensorflowMlrt_Op<"await_all", [Pure]> { + let summary = "Await tensors from a list of !mlrt.future"; + + let description = [{ + Await tensors from a list of !mlrt.future. + + $futures: A list of !mlrt.future. The underlying value must be tensorflow tensors. + + $results: A list of tensorflow tensors. 
+ }]; + + let arguments = (ins + Variadic:$futures + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "$futures attr-dict `:` type($results)"; +} + +def PromiseOp: TensorflowMlrt_Op<"promise", []> { + let summary = "Set a tensor in a promise"; + + let description = [{ + Set a tensor in a promise. + + $promise: A value of type !mlrt.promise. The underlying value must be a tensorflow tensor. + $tensor: A tensorflow tensor. + }]; + + let arguments = (ins + MlrtPromiseType:$promise, + TFTensorType:$tensor + ); + + let assemblyFormat = "operands attr-dict"; +} + +def PromiseFutureOp: TensorflowMlrt_Op<"promise_future", []> { + let summary = "Set a tensor future in a promise"; + + let description = [{ + Set a tensor future in a promise. + + $promise: A value of type !mlrt.promise. The underlying value must be a tensorflow tensor. + $future: A value of type !mlrt.future. Must represent a tensorflow tensor. + }]; + + let arguments = (ins + MlrtPromiseType:$promise, + MlrtFutureType:$tensor + ); + + let assemblyFormat = "operands attr-dict"; +} + + +def AllocateFuturesOp: TensorflowMlrt_Op<"allocate_futures", [AttrSizedResultSegments]> { + let summary = "Allocate futures and promises for tensorflow tensors"; + + let description = [{ + Allocate futures and promises for tensorflow tensors. + + $num_futures: The number of futures to be allocated. + + $promises: There are $num_futures promises. promises[i] shares the state with futures[i]. + $futures: There are $num_futures futures. futures[i] shares the state with promises[i]. + }]; + + let arguments = (ins + I32Attr:$num_futures + ); + + let results = (outs + Variadic:$promises, + Variadic:$futures + ); +} + +def TensorToIntOp : TensorflowMlrt_Op<"tensor_to_int32", [Pure]> { + let summary = "Cast a Tensor to int32."; + let description = [{ + Cast a Tensor to int32. 
+ + Example: + %one = tf_mlrt.tensor_to_int32 %src_tensor + }]; + + let arguments = (ins TFTensorType:$src); + let results = (outs I32:$result); + let assemblyFormat = "operands attr-dict"; +} + +def PredicateOp : TensorflowMlrt_Op<"predicate", [Pure]> { + let summary = "Converts a fallback tensor to a bool"; + + let description = [{ + Note: this kernel is used for CPU tensors. + + Converts a fallback tensor to a bool with the following rules: + + - For 0D tensors, truthiness is determined by comparing against a "zero" + value. For numerical types it is the obvious zero. For strings it is the + empty string. + + - For >0D tensors, truthiness is determined by looking at the number of + elements. If it has zero elements, then the result is false. Otherwise the + result is true. + + input: a fallback tensor representing the condition. + device: the name of the tensorflow device that is associated with the + input fallback tensor. + + output: the converted bool. + }]; + + let arguments = (ins + TFTensorType:$input + ); + + let results = (outs + I1:$output + ); + + let assemblyFormat = "$input attr-dict"; +} + +def BatchFunctionOp : TensorflowMlrt_Op<"batch_function", [Pure]> { + let summary = "Fallback ExecuteOp specialized for tf.BatchFunction."; + + let description = [{ + This kernel executes a variant tf.BatchFunction kernel that supports having + the `f` attribute as a bytecode function. + + Example: + %res = tf_mlrt.batch_function(%input, %captured_input) { + device = "/device:CPU:0", + f = @batch_function, + node_def = "..." + } : (!tf_mlrt.tensor,!tf_mlrt.tensor) -> (!tf_mlrt.tensor) + + Note that the trailing number indicates the number of results. 
+ }]; + + let arguments = (ins + Variadic:$args, + StrAttr:$device, + SymbolRefAttr:$f, + StrAttr:$node_def + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "`(` $args `)` attr-dict `:` functional-type($args, $results)"; +} + +def CancelOp: TensorflowMlrt_Op<"cancel", []> { + let summary = "Handle cancellation request."; + + let description = [{ + This kernel will early terminate the program upon cancellation request (e.g. time out). + }]; +} + +def MapFnOp : TensorflowMlrt_Op<"map_fn", [AttrSizedOperandSegments, Pure]> { + let summary = "The Parallel Map for tf_mlrt dialect"; + let description = [{ + The Pmap executes body function in parallel for all ranges up to $max_iterations. + + The pseudo code: + for(int i = 0; i < $max_iterations; i++) { + body_fn(MlrtFuture($tensor_list_or_flow_in[i]), + MlrtPromise($tensor_list_or_flow_in[i+1]), + i, i, $invariant_args); + } + + return $tensor_list_or_flow_in[$max_iterations] + }]; + + let arguments = (ins + TFTensorType:$max_iterations, + Variadic:$tensor_list_or_flow_in, + Variadic:$invariant_args, + FlatSymbolRefAttr:$body_fn, + I32Attr:$num_tensor_list_or_flow_in + ); + + let results = (outs + Variadic:$result + ); + + let assemblyFormat = "`(`$max_iterations`,` $tensor_list_or_flow_in`,` $invariant_args `)` attr-dict `:` functional-type(operands, results)"; +} + + +#endif diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.cc b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.cc new file mode 100644 index 00000000000..94e5d52bde1 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.cc @@ -0,0 +1,46 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h" + +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/DialectImplementation.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" + +namespace tensorflow { +namespace tf_mlrt_tpu { + +TensorflowMlrtTpuDialect::TensorflowMlrtTpuDialect(mlir::MLIRContext *context) + : mlir::Dialect(/*name=*/"tf_mlrt_tpu", context, + mlir::TypeID::get()) { + addOperations< +#define GET_OP_LIST +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.cpp.inc" + >(); +} + +} // namespace tf_mlrt_tpu +} // namespace tensorflow + +//===----------------------------------------------------------------------===// +// TableGen'd op method definitions +//===----------------------------------------------------------------------===// + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.cpp.inc" diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h new file mode 100644 index 00000000000..a428488da86 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h @@ -0,0 +1,39 @@ +/* 
Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_TPU_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_TPU_OPS_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Dialect.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/OpImplementation.h" // from @llvm-project +#include "mlir/Interfaces/SideEffectInterfaces.h" // from @llvm-project + +namespace tensorflow { +namespace tf_mlrt_tpu { + +class TensorflowMlrtTpuDialect : public mlir::Dialect { + public: + explicit TensorflowMlrtTpuDialect(mlir::MLIRContext *context); + static llvm::StringRef getDialectNamespace() { return "tf_mlrt_tpu"; } +}; + +} // namespace tf_mlrt_tpu +} // namespace tensorflow + +#define GET_OP_CLASSES +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h.inc" + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_IR_MLRT_TF_MLRT_TPU_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.td new file mode 100644 index 00000000000..a207b83c7e5 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.td @@ -0,0 +1,82 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifdef TF_MLRT_TPU_OPS +#else +#define TF_MLRT_TPU_OPS + +include "mlir/Interfaces/SideEffectInterfaces.td" +include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td" +include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td" + +def TensorflowMlrtTpu_Dialect : Dialect { + let name = "tf_mlrt_tpu"; + + let description = [{ + The TF MLRT TPU Dialect. + }]; + + let cppNamespace = "::tensorflow::tf_mlrt_tpu"; +} + +class TensorflowMlrtTpu_Op traits = []> : + Op { +} + +def GetTpuHostDeviceOp : TensorflowMlrtTpu_Op<"get_tpu_host_device", [Pure]> { + let summary = "get the tpu host allocator that implements tensorflow::Device"; + + let results = (outs + TFDeviceType:$device + ); + + let assemblyFormat = "attr-dict"; +} + +def CompileAndExecuteOp : TensorflowMlrtTpu_Op<"compile_and_execute"> { + let summary = "tpu compile and execute operation"; + let description = [{ + tf_mlrt_tpu.compile_and_execute compiles a mlir tpu program and executes the compiled tpu program. + + $mlir_module is a serialized MLIR module with a `main` function that contains target computation. + $metadata is a serialized TPUCompileMetadataProto describing the shapes and types of the inputs to the computation, as well as a mapping onto the TPU pod topology. + $constant_operand_indices are the indices of the inputs that are constant to the TPU program (e.g. 
weights in inference), the rest of the inputs are input tensors. + constant_operand_indices is sorted in ascending order. + $operands_with_static_shape are indices of operands that are tagged with a maximum static shape. + $producer_name is a string describing the name of the framework that added support for running this portion of the model on TPUs. + + Example: + %rendezvous_key_base, %result = tf_mlrt_tpu.compile_and_execute (%operands) constant_operand_indices = [1, 3] metadata = "metadata..." mlir_module = "mlir_module..." + }]; + let arguments = (ins + Variadic:$operands_and_static_shapes, + DenseI32ArrayAttr:$constant_operand_indices, + StrAttr:$metadata, + StrAttr:$mlir_module, + UI32Attr:$num_operands, + DenseI32ArrayAttr:$operands_with_static_shape, + StrAttr:$producer_name + ); + + let results = (outs + TFTensorType:$rendezvous_key_base, + Variadic:$results + ); + + let assemblyFormat = [{ + `(` $operands_and_static_shapes `)` attr-dict `:` functional-type($operands_and_static_shapes, results) + }]; +} + +#endif diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td new file mode 100644 index 00000000000..7268588749d --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_ops.td @@ -0,0 +1,131 @@ +/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifdef MLRT_TF_OPS +#else +#define MLRT_TF_OPS + +include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_dialect.td" +include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.td" +include "tensorflow/compiler/mlir/tensorflow/ir/tf_op_base.td" +include "third_party/tf_runtime/include/tfrt/compiler/opdefs/tfrt_op_interfaces.td" +include "third_party/tf_runtime/include/tfrt/compiler/opdefs/tfrt_traits.td" + +// tf_mlrt.tf_await returns a tensorflow Tensor. It is a fake op that is only +// used during parallelization and has no runtime implementation. +def TFAwaitOp: TensorflowMlrt_Op<"tf_await", [Pure, TFRT_CostFunctionInterface, TFRT_FixedCost<1>]> { + let summary = "Await a tensor from a !mlrt.future"; + + let description = [{ + Await a tensor from a !mlrt.future. + + $future: A value of type !mlrt.future. The underlying value must be a tensorflow tensor. + + $result: a tensorflow tensor. + }]; + + let arguments = (ins + MlrtFutureType:$future + ); + + let results = (outs + TF_Tensor:$result + ); +} + +// tf_mlrt.tf_promise takes a tensorflow Tensor. It is a fake op that is only +// used during parallelization and has no runtime implementation. +def TFPromiseOp: TensorflowMlrt_Op<"tf_promise", [TF_MustExecute, TFRT_CostFunctionInterface, TFRT_FixedCost<1>]> { + let summary = "Set a tensor in a promise"; + + let description = [{ + Set a tensor in a promise. + + $promise: A value of type !mlrt.promise. The underlying value will always be a tensorflow tensor. + $tensor: A tensorflow tensor. + }]; + + let arguments = (ins + MlrtPromiseType:$promise, + TF_Tensor:$tensor + ); +} + +def TFMapFnOp : TensorflowMlrt_Op<"tf_map_fn", [AttrSizedOperandSegments, Pure]> { + let summary = "The Parallel Map for tf_mlrt dialect"; + let description = [{ + The Pmap executes body function in parallel for all ranges up to $max_iterations. 
+ + The pseudo code: + for(int i = 0; i < $max_iterations; i++) { + body_fn(MlrtFuture($tensor_list_or_flow_in[i]), + MlrtPromise($tensor_list_or_flow_in[i+1]), + i, i, $invariant_args); + } + + return $tensor_list_or_flow_in[$max_iterations] + }]; + + let arguments = (ins + TF_Tensor:$max_iterations, + Variadic:$tensor_list_or_flow_in, + Variadic:$invariant_args, + FlatSymbolRefAttr:$body_fn, + I32Attr:$num_tensor_list_or_flow_in + ); + + let results = (outs + Variadic:$result + ); + + let assemblyFormat = "`(`$max_iterations`,` $tensor_list_or_flow_in`,` $invariant_args `)` attr-dict `:` functional-type(operands, results)"; +} + +def TFTPUCompileAndExecuteOp : TensorflowMlrt_Op<"tf_tpu_compile_and_execute", [TF_MustExecute]> { + let summary = "tpu compile and execute operation"; + let description = [{ + tf_mlrt_tpu.compile_and_execute compiles a mlir tpu program and executes the compiled tpu program. + + $mlir_module is a serialized MLIR module with a `main` function that contains target computation. + $metadata is a serialized TPUCompileMetadataProto describing the shapes and types of the inputs to the computation, as well as a mapping onto the TPU pod topology. + $constant_operand_indices are the indices of the inputs that are constant to the TPU program (e.g. weights in inference), the rest of the inputs are input tensors. + constant_operand_indices is sorted in ascending order. + $operands_with_static_shape are indices of operands that are tagged with a maximum static shape. + $producer_name is a string describing the name of the framework that added support for running this portion of the model on TPUs. + + Example: + %rendezvous_key_base, %result = tf_mlrt_tpu.compile_and_execute (%operands) constant_operand_indices = [1, 3] metadata = "metadata..." mlir_module = "mlir_module..." 
+ }]; + let arguments = (ins + Variadic:$operands_and_static_shapes, + DenseI32ArrayAttr:$constant_operand_indices, + StrAttr:$metadata, + StrAttr:$mlir_module, + UI32Attr:$num_operands, + DenseI32ArrayAttr:$operands_with_static_shape, + StrAttr:$producer_name + ); + + let results = (outs + TF_Tensor:$rendezvous_key_base, + Variadic:$results + ); + + let assemblyFormat = [{ + `(` $operands_and_static_shapes `)` attr-dict `:` functional-type($operands_and_static_shapes, results) + }]; +} + + +#endif diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.cc b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.cc index 9643c041cf6..19d29e506b3 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.cc +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.cc @@ -14,6 +14,8 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" +#include + #include "llvm/ADT/STLExtras.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.cc b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.cc index 2dc4adfa084..28af77dd5a7 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.cc +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.cc @@ -14,6 +14,8 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h" +#include + #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h index 93e75309206..e78d247c038 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_common.h @@ -15,6 +15,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_COMMON_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_IR_TFRT_FALLBACK_COMMON_H_ +#include + #include "llvm/ADT/STLExtras.h" #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.td b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.td index daf76268bc2..bba8a021921 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.td +++ b/tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.td @@ -240,4 +240,47 @@ def ConvertFallbackTensorToDhtOp : FallbackSync_Op<"convert_fallback_tensor_to_d let assemblyFormat = "operands attr-dict `:` type($dht)"; } +// TODO(rohitju): This is Ads specific, need to find an appropriate home for it. +def SetSparseMatrixResourceOp : FallbackSync_Op<"set_sparse_matrix_resource", [CoreRT_TypedAttributeTrait]> { + let summary = "Set a Sparse matrix in resource array"; + + let description = [{ + Set a sparse matrix in resource array. + + arg: the matrix to be set in the resource array. 
+ index: the index in the resource array + }]; + + let arguments = (ins + TFTensorType:$arg, + I64Attr:$index + ); + + let results = (outs); + + let assemblyFormat = "operands attr-dict"; +} + +def GetSparseMatrixResourceOp : FallbackSync_Op<"get_sparse_matrix_resource", + [CoreRT_TypedAttributeTrait]> { + let summary = "get a sparse matrix from resource array"; + + let description = [{ + Get a sparse matrix from resource array. + + indices: the indices in the resource array. + results: the tensor values for the corresponding indices. + }]; + + let arguments = (ins + I64ArrayAttr:$indices + ); + + let results = (outs + Variadic:$results + ); + + let assemblyFormat = "attr-dict `:` type($results)"; +} + #endif diff --git a/tensorflow/compiler/mlir/tfrt/jit/python_binding/tf_jitrt_executor.cc b/tensorflow/compiler/mlir/tfrt/jit/python_binding/tf_jitrt_executor.cc index 18634cdc3a1..f0d5e08ece0 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/python_binding/tf_jitrt_executor.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/python_binding/tf_jitrt_executor.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/ExecutionEngine/CRunnerUtils.h" diff --git a/tensorflow/compiler/mlir/tfrt/jit/python_binding/tfrt_fallback.cc b/tensorflow/compiler/mlir/tfrt/jit/python_binding/tfrt_fallback.cc index 4f4b9439619..c36cd3f498a 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/python_binding/tfrt_fallback.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/python_binding/tfrt_fallback.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tfrt/jit/python_binding/tfrt_fallback.h" #include +#include #include #include "llvm/ADT/SmallVector.h" diff --git a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.cc b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.cc index cc649528ad5..58f1501b54a 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.h" +#include #include #include "absl/time/time.h" diff --git a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.h b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.h index 5a328b7f265..92378ee7ef3 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.h +++ b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_TFRT_JIT_TF_JITRT_H_ #define TENSORFLOW_COMPILER_MLIR_TFRT_JIT_TF_JITRT_H_ +#include #include #include diff --git a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_kernels.cc b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_kernels.cc index 6fe3091fed3..ec1d4f199eb 100644 --- a/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_kernels.cc +++ b/tensorflow/compiler/mlir/tfrt/jit/tf_jitrt_kernels.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #define EIGEN_USE_THREADS #include @@ -279,7 +280,7 @@ static std::string AsTensorContent(const MemrefDesc& desc) { } // Gets the session name from the fallback request state. 
-static const std::string GetSessionName(RequestContext* req_ctx) { +static std::string GetSessionName(RequestContext* req_ctx) { auto* fallback = req_ctx->GetDataIfExists(); if (!fallback) return ""; diff --git a/tensorflow/compiler/mlir/tfrt/python_tests/BUILD b/tensorflow/compiler/mlir/tfrt/python_tests/BUILD index 15574068e61..4f7288d4c9c 100644 --- a/tensorflow/compiler/mlir/tfrt/python_tests/BUILD +++ b/tensorflow/compiler/mlir/tfrt/python_tests/BUILD @@ -15,7 +15,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -27,7 +27,7 @@ py_strict_test( tags = ["no_pip"], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -42,7 +42,7 @@ py_strict_test( ], deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -57,7 +57,7 @@ py_strict_test( ], deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -72,7 +72,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -87,7 +87,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. 
deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -102,7 +102,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -114,7 +114,7 @@ py_strict_test( tags = ["no_pip"], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -129,7 +129,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -141,7 +141,7 @@ py_strict_test( tags = ["no_pip"], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -154,7 +154,7 @@ py_strict_test( deps = [ "//tensorflow:tensorflow_py", "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", "@absl_py//absl/flags", "@absl_py//absl/testing:parameterized", @@ -168,10 +168,11 @@ py_strict_test( tags = [ "no_oss", "no_pip", + "not_run:arm", ], # TODO(b/201803253): TFRT pybindings not in OSS. 
deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -186,7 +187,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -201,7 +202,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -216,7 +217,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -231,7 +232,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -243,7 +244,7 @@ py_strict_test( tags = ["no_pip"], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -258,7 +259,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. 
deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -271,10 +272,11 @@ py_strict_test( tags = [ "no_oss", "no_pip", # TODO(b/201803253): TFRT pybindings not in OSS. + "not_run:arm", ], deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -289,7 +291,7 @@ py_strict_test( ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -301,10 +303,11 @@ py_strict_test( tags = [ "no_oss", "no_pip", + "not_run:arm", ], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -316,7 +319,7 @@ py_strict_test( tags = ["no_pip"], # TODO(b/201803253): TFRT pybindings not in OSS. deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) @@ -328,7 +331,7 @@ py_strict_test( tags = ["no_pip"], # TODO(b/201803253): TFRT pybindings not in OSS. 
deps = [ "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", - "//tensorflow/python:client_testlib", + "//tensorflow/python/platform:client_testlib", "//third_party/py/numpy", ], ) diff --git a/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/build_defs.bzl b/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/build_defs.bzl index fb183e02362..9e64cfb74f6 100644 --- a/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/build_defs.bzl +++ b/tensorflow/compiler/mlir/tfrt/python_tests/regression_tests/build_defs.bzl @@ -31,7 +31,6 @@ def _run_regression_test(name, compare_with_tensorflow, vectorize, data): "//third_party/py/numpy", "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tf_jitrt", "//tensorflow/compiler/mlir/tfrt/jit/python_binding:tfrt_fallback", - "//tensorflow/python:client_testlib", "//tensorflow/python/platform:tf_logging", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:resource_loader", diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc index 47572390666..214dd90d009 100644 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h" +#include + #include "absl/strings/str_split.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h index fab391c753b..94b7f73fd73 100644 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include #include #include "absl/container/flat_hash_map.h" diff --git a/tensorflow/compiler/mlir/tfrt/test_opkernels.cc b/tensorflow/compiler/mlir/tfrt/test_opkernels.cc new file mode 100644 index 00000000000..096a2626b75 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/test_opkernels.cc @@ -0,0 +1,49 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/core/framework/common_shape_fns.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" + +namespace tensorflow { +namespace tf_mlrt { + +REGISTER_OP("TestAsyncIdentity") + .Input("in: T") + .Output("out: T") + .Attr( + "T: {bfloat16, half, float, double, uint8, int8, int16, uint32, int32, " + "int64, complex64, complex128}") + .SetShapeFn(::tensorflow::shape_inference::UnchangedShape); + +class TestAsyncIdentityKernel : public AsyncOpKernel { + public: + explicit TestAsyncIdentityKernel(OpKernelConstruction* context) + : AsyncOpKernel(context) {} + + void ComputeAsync(OpKernelContext* ctx, DoneCallback done) override { + const Tensor& in = ctx->input(0); + ctx->set_output(0, in); + done(); + } + + private: + TF_DISALLOW_COPY_AND_ASSIGN(TestAsyncIdentityKernel); +}; + +REGISTER_KERNEL_BUILDER(Name("TestAsyncIdentity").Device(DEVICE_CPU), + TestAsyncIdentityKernel); + +} // namespace tf_mlrt +} // namespace tensorflow 
diff --git a/tensorflow/compiler/mlir/tfrt/tests/BUILD b/tensorflow/compiler/mlir/tfrt/tests/BUILD index f7df07f4708..3bff3d02e5f 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/BUILD +++ b/tensorflow/compiler/mlir/tfrt/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir:run_lit.sh", features = if_oss(["--path=org_tensorflow/tensorflow/compiler/mlir/tfrt"]), diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD b/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD index 91f9e57a9b2..c9b64b7b4fb 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD +++ b/tensorflow/compiler/mlir/tfrt/tests/analysis/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir:run_lit.sh", exclude = ["testdata/**"], diff --git a/tensorflow/compiler/mlir/tfrt/tests/analysis/update_op_cost_in_tfrt_mlir_test.cc b/tensorflow/compiler/mlir/tfrt/tests/analysis/update_op_cost_in_tfrt_mlir_test.cc index 0c50bbbdea8..420a937eaae 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/analysis/update_op_cost_in_tfrt_mlir_test.cc +++ b/tensorflow/compiler/mlir/tfrt/tests/analysis/update_op_cost_in_tfrt_mlir_test.cc @@ -47,7 +47,13 @@ absl::flat_hash_map GetOpCostMap(mlir::ModuleOp op) { return op_cost_map; } -TEST(CostUpdateTest, Basic) { +struct TestParams { + uint32_t normalize_ratio = 1; +}; + +class CostUpdateTest : public ::testing::TestWithParam {}; + +TEST_P(CostUpdateTest, Basic) { std::string saved_model_mlir_path = tensorflow::GetDataDependencyFilepath( "tensorflow/compiler/mlir/tfrt/tests/analysis/testdata/test.mlir"); @@ -61,15 +67,17 @@ TEST(CostUpdateTest, Basic) { ASSERT_TRUE(module); // Create a cost recorder with fake cost records. 
- auto expected_op_cost_map = GetOpCostMap(module.get()); - EXPECT_EQ(expected_op_cost_map.size(), 1); + auto fake_recorded_op_cost_map = GetOpCostMap(module.get()); + EXPECT_EQ(fake_recorded_op_cost_map.size(), 1); unsigned int seed = 23579; - for (auto& [op_key, cost] : expected_op_cost_map) { + for (auto& [op_key, cost] : fake_recorded_op_cost_map) { cost = rand_r(&seed) % 1000; } - tensorflow::tfrt_stub::CostRecorder cost_recorder; - for (const auto& [op_key, cost] : expected_op_cost_map) { - cost_recorder.RecordCostNanosecond(op_key, cost); + tensorflow::tfrt_stub::CostRecorder cost_recorder(GetParam().normalize_ratio); + absl::flat_hash_map expected_op_cost_map; + for (const auto& [op_key, cost] : fake_recorded_op_cost_map) { + cost_recorder.RecordCost(op_key, cost); + expected_op_cost_map[op_key] = cost_recorder.GetCost(op_key); } // Update the TFRT MLIR with the cost recorder. @@ -80,5 +88,8 @@ TEST(CostUpdateTest, Basic) { EXPECT_THAT(got_op_cost_map, ::testing::ContainerEq(expected_op_cost_map)); } +INSTANTIATE_TEST_SUITE_P(CostUpdateTests, CostUpdateTest, + ::testing::Values(TestParams{1}, TestParams{100})); + } // namespace } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/tests/ir/BUILD b/tensorflow/compiler/mlir/tfrt/tests/ir/BUILD index e4662bd66d7..8d49d08b102 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/ir/BUILD +++ b/tensorflow/compiler/mlir/tfrt/tests/ir/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir:run_lit.sh", exclude = ["testdata/**"], diff --git a/tensorflow/compiler/mlir/tfrt/tests/ir/tfrt_fallback_util_test.cc b/tensorflow/compiler/mlir/tfrt/tests/ir/tfrt_fallback_util_test.cc index 1cf8bc78a2d..bfa9c148174 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/ir/tfrt_fallback_util_test.cc +++ b/tensorflow/compiler/mlir/tfrt/tests/ir/tfrt_fallback_util_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_util.h" #include +#include +#include #include "mlir/Parser/Parser.h" // from @llvm-project #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" diff --git a/tensorflow/compiler/mlir/tfrt/tests/jit/BUILD b/tensorflow/compiler/mlir/tfrt/tests/jit/BUILD index d4abbb3bc44..3e1391e6e45 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/jit/BUILD +++ b/tensorflow/compiler/mlir/tfrt/tests/jit/BUILD @@ -8,6 +8,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir:run_lit.sh", features = if_oss(["--path=org_tensorflow/tensorflow/compiler/mlir/tfrt"]), diff --git a/tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt/BUILD b/tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt/BUILD index 16f00e94ca7..e23bbda91e0 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/tests/lhlo_to_jitrt/BUILD @@ -9,6 +9,7 @@ package( # copybara:uncomment_begin # # glob_lit_tests( +# name = "all_tests", # data = [":test_utilities"], # driver = "//tensorflow/compiler/mlir:run_lit.sh", # features = if_oss(["--path=org_tensorflow/tensorflow/compiler/mlir/tfrt"]), diff --git a/tensorflow/compiler/xla/mlir_hlo/tosa/tests/BUILD b/tensorflow/compiler/mlir/tfrt/tests/mlrt/BUILD similarity index 54% rename from tensorflow/compiler/xla/mlir_hlo/tosa/tests/BUILD rename to tensorflow/compiler/mlir/tfrt/tests/mlrt/BUILD index a32ee149f5a..8da9c4cf2d5 100644 --- a/tensorflow/compiler/xla/mlir_hlo/tosa/tests/BUILD +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/BUILD @@ -1,14 +1,11 @@ -load("//tensorflow/tsl:tsl.default.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") -package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], - licenses = ["notice"], -) +# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) glob_lit_tests( + name = 
"all_tests", data = [":test_utilities"], - driver = "@llvm-project//mlir:run_lit.sh", + driver = "//tensorflow/compiler/mlir:run_lit.sh", test_file_exts = ["mlir"], ) @@ -17,7 +14,8 @@ filegroup( name = "test_utilities", testonly = True, data = [ - "//tensorflow/compiler/xla/mlir_hlo/tosa:mhlo-tosa-opt", + "//tensorflow/compiler/mlir/tfrt:tf-tfrt-opt", "@llvm-project//llvm:FileCheck", + "@llvm-project//mlir:run_lit.sh", ], ) diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/assign_op_key.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/assign_op_key.mlir new file mode 100644 index 00000000000..9677f183b61 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/assign_op_key.mlir @@ -0,0 +1,49 @@ +// RUN: tf-tfrt-opt -split-input-file -tf-mlrt-assign-op-key %s | FileCheck %s + +// CHECK-LABEL: func @main +// CHECK: tf.AddV2 +// CHECK-SAME: {__op_key = 0 : i32} + +// CHECK: tf.AddV2 +// CHECK-SAME: {__op_key = 1 : i32} + +// CHECK: tf.AddV2 +// CHECK-SAME: {__op_key = 2 : i32} + +// CHECK: tf.AddV2 +// CHECK-SAME: {__op_key = 3 : i32} + +// CHECK: tf.Sub +// CHECK-SAME: {__op_key = 4 : i32} + +// CHECK: tf.Sub +// CHECK-SAME: {__op_key = 5 : i32} + +// CHECK: tf.Sub +// CHECK-SAME: {__op_key = 6 : i32} + +// CHECK: tf.Sub +// CHECK-SAME: {__op_key = 7 : i32} + + +// CHECK: [[x:%.*]] = "tf.AddV2" +// CHECK-SAME: {__op_key = 8 : i32} + +// CHECK: return [[x]] + +func.func @main(%a: tensor, %b: tensor) -> tensor { + + %a0 = "tf.AddV2"(%a, %a) : (tensor, tensor) -> tensor + %a1 = "tf.AddV2"(%a0, %a) : (tensor, tensor) -> tensor + %a2 = "tf.AddV2"(%a1, %a) : (tensor, tensor) -> tensor + %a3 = "tf.AddV2"(%a2, %a) : (tensor, tensor) -> tensor + + %b0 = "tf.Sub"(%b, %b) : (tensor, tensor) -> tensor + %b1 = "tf.Sub"(%b0, %b) : (tensor, tensor) -> tensor + %b2 = "tf.Sub"(%b1, %b) : (tensor, tensor) -> tensor + %b3 = "tf.Sub"(%b2, %b) : (tensor, tensor) -> tensor + + %c = "tf.AddV2"(%a3, %b3) : (tensor, tensor) -> tensor + + func.return %c : tensor +} diff --git 
a/tensorflow/compiler/mlir/tfrt/tests/mlrt/fuse_mlrt_ops.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/fuse_mlrt_ops.mlir new file mode 100644 index 00000000000..ce750ec73ed --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/fuse_mlrt_ops.mlir @@ -0,0 +1,58 @@ +// RUN: tf-tfrt-opt -split-input-file -tf-mlrt-fuse %s | FileCheck %s + +// CHECK-LABEL: @main +// CHECK-SAME: ([[f0:%.*]]: !mlrt.future, [[f1:%.*]]: !mlrt.future, [[f2:%.*]]: !mlrt.future) +func.func @main(%f0: !mlrt.future, %f1: !mlrt.future, %f2: !mlrt.future) -> (!tf_mlrt.tensor, !tf_mlrt.tensor, !tf_mlrt.tensor) { + // CHECK-NEXT: [[t:%.*]]:3 = tf_mlrt.await_all [[f0]], [[f1]], [[f2]] + // CHECK-NOT: tf_mlrt.await + // CHECK-NEXT: return [[t]]#0, [[t]]#1, [[t]]#2 + %t0 = tf_mlrt.await %f0 + %t1 = tf_mlrt.await %f1 + %t2 = tf_mlrt.await %f2 + func.return %t0, %t1, %t2 : !tf_mlrt.tensor, !tf_mlrt.tensor, !tf_mlrt.tensor +} + +// ----- + +// CHECK-LABEL: @main +// CHECK-SAME: ([[f0:%.*]]: !mlrt.future, [[f1:%.*]]: !mlrt.future, [[f2:%.*]]: !mlrt.future) +func.func @main(%f0: !mlrt.future, %f1: !mlrt.future, %f2: !mlrt.future) -> (!tf_mlrt.tensor, !tf_mlrt.tensor) { + // CHECK-NEXT: [[t:%.*]]:2 = tf_mlrt.await_all [[f0]], [[f1]] + // CHECK-NOT: tf_mlrt.await + // CHECK-NEXT: [[t2:%.*]] = tf_mlrt.executeop([[t]]#0, [[t]]#1) + // CHECK-NEXT: [[t3:%.*]] = tf_mlrt.await [[f2]] + // CHECK-NEXT: return [[t2]], [[t3]] + %t0 = tf_mlrt.await %f0 + %t1 = tf_mlrt.await %f1 + %t2 = tf_mlrt.executeop(%t0, %t1) {node_def = "AddV2", op_key = 0 : i32} : (!tf_mlrt.tensor, !tf_mlrt.tensor) -> (!tf_mlrt.tensor) + %t3 = tf_mlrt.await %f2 + func.return %t2, %t3 : !tf_mlrt.tensor, !tf_mlrt.tensor +} + +// ----- + +// CHECK-LABEL: @main +// CHECK-SAME: ([[f0:%.*]]: !mlrt.async_handle, [[f1:%.*]]: !mlrt.async_handle, [[f2:%.*]]: !mlrt.async_handle) +func.func @main(%f0: !mlrt.async_handle, %f1: !mlrt.async_handle, %f2: !mlrt.async_handle) -> () { + // CHECK-NEXT: mlrt.await_all_handle [[f0]], [[f1]], [[f2]] + // 
CHECK-NOT: mlrt.await_handle + // CHECK-NEXT: return + mlrt.await_handle %f0 + mlrt.await_handle %f1 + mlrt.await_handle %f2 + func.return +} + +// ----- + +// CHECK-LABEL: @main +func.func @main() -> (!tf_mlrt.tensor, !tf_mlrt.tensor) { + // CHECK-NEXT: [[r:%.*]]:3 = tf_mlrt.get_resource {indices = [2, 0, 1]} + // CHECK-NEXT: [[v:%.*]] = tf_mlrt.executeop([[r]]#0, [[r]]#1) + // CHECK-NEXT: return [[v]], [[r]]#2 + %0 = tf_mlrt.get_resource {indices = [2]} : !tf_mlrt.tensor + %1 = tf_mlrt.get_resource {indices = [0]} : !tf_mlrt.tensor + %r = tf_mlrt.executeop(%0, %1) {node_def = "AddV2", op_key = 0 : i32} : (!tf_mlrt.tensor, !tf_mlrt.tensor) -> (!tf_mlrt.tensor) + %2 = tf_mlrt.get_resource {indices = [1]} : !tf_mlrt.tensor + func.return %r, %2 : !tf_mlrt.tensor, !tf_mlrt.tensor +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/inline.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/inline.mlir new file mode 100644 index 00000000000..de2a29c017d --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/inline.mlir @@ -0,0 +1,50 @@ +// RUN: tf-tfrt-opt -split-input-file -pass-pipeline='builtin.module(tf-to-mlrt, inline)' %s | FileCheck %s -dump-input=fail + +// Test generated tf_mlrt while body and predicate is inlined. 
+ +func.func @then(%x: tensor, %y: tensor, %z: tensor) -> tensor { + return %x: tensor +} + +func.func @else(%x: tensor, %y: tensor, %z: tensor) -> tensor { + return %y: tensor +} + +// CHECK-LABEL: func @while_cond_if +// CHECK: [[cond:%.*]] = tf_mlrt.predicate +// CHECK: [[z:%.*]] = mlrt.cond [[cond]] @then @else +// CHECK: return [[z]] +func.func @while_cond_if(%cond: tensor, %x: tensor, %y: tensor, %z: tensor) -> (tensor) { + %r = "tf.If"(%cond, %x, %y, %z) {then_branch = @then, else_branch = @else, is_stateless = true} : (tensor, tensor, tensor, tensor) -> tensor + return %r : tensor +} + +// CHECK-LABEL: func @while_body_if +func.func @while_body_if(%cond: tensor, %x: tensor, %y: tensor, %z: tensor) -> (tensor, tensor, tensor, tensor) { + %0 = "tf.Const"() {__op_key = 0: i32, device = "/device:CPU:0", value = dense<2> : tensor} : () -> tensor + %1 = "tf.Add"(%z, %0) {__op_key = 1: i32, device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %cond, %x, %y, %1 : tensor, tensor, tensor, tensor +} + +// CHECK-LABEL: func @while_test_if +// CHECK-SAME: -> !tf_mlrt.tensor +func.func @while_test_if(%cond: tensor, %x: tensor, %y: tensor) -> (tensor) { + // CHECK: [[CONST:%.*]] = tf_mlrt.executeop + %cst = "tf.Const"() {__op_key = 2: i32, device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor + // Predicate should be inlined. 
+ // CHECK-NEXT: tf_mlrt.predicate + // CHECK-NEXT: mlrt.cond + // CHECK-NEXT: tf_mlrt.predicate + + // CHECK-NEXT: mlrt.while + %0:4 = "tf.While"(%cond, %x, %y, %cst) { cond = @while_cond_if, body = @while_body_if, is_stateless = false, parallel_iterations = 1} : (tensor, tensor, tensor, tensor) -> (tensor, tensor, tensor, tensor) + // CHECK: return + // CHECK-SAME: !tf_mlrt.tensor + func.return %0#3 : tensor +} + +// CHECK-LABEL: func @"while_body_if/tf_mlrt_body" +// CHECK-NOT: call + +// CHECK-LABEL: func @"while_cond_if/tf_mlrt_predicate" +// CHECK-NOT: call diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/parallelization.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/parallelization.mlir new file mode 100644 index 00000000000..d4ab7c2c321 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/parallelization.mlir @@ -0,0 +1,378 @@ +// RUN: tf-tfrt-opt -split-input-file -tf-mlrt-parallelization %s | FileCheck %s --dump-input=fail --dump-input-filter=all + +// CHECK-LABEL: func private @main_stream_{{[0-9]*}} +// CHECK-SAME: ({{%.*}}: tensor, [[PROMISE:%.*]]: !mlrt.promise) +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: [[RES:%.*]] = "tf.Sub" +// CHECK: "tf_mlrt.tf_promise"([[PROMISE]], [[RES]]) +// CHECK: return + +// CHECK-LABEL: func @main +// CHECK: [[PROMISE:%.*]], [[FUTURE:%.*]] = "tf_mlrt.allocate_futures" +// CHECK: [[HANDLE:%.*]] = mlrt.async({{%.*}}, [[PROMISE]]) +// CHECK-SAME: callee = @main_stream_{{[0-9]*}} +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: [[x:%.*]] = "tf.AddV2" +// CHECK: [[y:%.*]] = "tf_mlrt.tf_await"([[FUTURE]]) +// CHECK: [[RES:%.*]] = "tf.AddV2"([[x]], [[y]]) +// CHECK: mlrt.await_handle [[HANDLE]] +// CHECK: return [[RES]] + +func.func @main(%a: tensor, %b: tensor) -> tensor { + + %a0 = "tf.AddV2"(%a, %a) : (tensor, tensor) -> tensor + %a1 = "tf.AddV2"(%a0, %a) : (tensor, tensor) -> tensor + %a2 = "tf.AddV2"(%a1, %a) : (tensor, tensor) -> tensor + %a3 = "tf.AddV2"(%a2, %a) : 
(tensor, tensor) -> tensor + + %b0 = "tf.Sub"(%b, %b) : (tensor, tensor) -> tensor + %b1 = "tf.Sub"(%b0, %b) : (tensor, tensor) -> tensor + %b2 = "tf.Sub"(%b1, %b) : (tensor, tensor) -> tensor + %b3 = "tf.Sub"(%b2, %b) : (tensor, tensor) -> tensor + + %c = "tf.AddV2"(%a3, %b3) : (tensor, tensor) -> tensor + + func.return %c : tensor +} + +// ----- + +// Test merging child streams + +// CHECK-LABEL: func private @main_stream_{{[0-9]*}} +// CHECK-SAME: ({{%.*}}: tensor, {{%.*}}: tensor, [[PROMISE:%.*]]: !mlrt.promise) +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: [[RES:%.*]] = "tf.Sub" +// CHECK: "tf_mlrt.tf_promise"([[PROMISE]], [[RES]]) +// CHECK: return + +// CHECK-LABEL: func @main +// CHECK: [[PROMISE:%.*]], [[FUTURE:%.*]] = "tf_mlrt.allocate_futures" +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: [[VALUE:%.*]] = "tf.AddV2" +// CHECK: [[HANDLE:%.*]] = mlrt.async([[VALUE]], {{%.*}}, [[PROMISE]]) +// CHECK-SAME: callee = @main_stream_{{[0-9]*}} +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: [[x:%.*]] = "tf.AddV2" +// CHECK: [[y:%.*]] = "tf_mlrt.tf_await"([[FUTURE]]) +// CHECK: [[RES:%.*]] = "tf.AddV2"([[x]], [[y]]) +// CHECK: mlrt.await_handle [[HANDLE]] +// CHECK: return [[RES]] + +func.func @main(%a: tensor, %b: tensor) -> tensor { + + %a0 = "tf.AddV2"(%a, %a) : (tensor, tensor) -> tensor + %a1 = "tf.AddV2"(%a0, %a) : (tensor, tensor) -> tensor + %a2 = "tf.AddV2"(%a1, %a) : (tensor, tensor) -> tensor + %a3 = "tf.AddV2"(%a2, %a) : (tensor, tensor) -> tensor + %a4 = "tf.AddV2"(%a3, %a) : (tensor, tensor) -> tensor + %a5 = "tf.AddV2"(%a4, %a) : (tensor, tensor) -> tensor + %a6 = "tf.AddV2"(%a5, %a) : (tensor, tensor) -> tensor + %a7 = "tf.AddV2"(%a6, %a) : (tensor, tensor) -> tensor + + %b0 = "tf.Sub"(%a3, %b) : (tensor, tensor) -> tensor + %b1 = "tf.Sub"(%b0, %b) : (tensor, tensor) -> tensor + %b2 = "tf.Sub"(%b1, %b) : (tensor, tensor) -> tensor + %b3 = "tf.Sub"(%b2, %b) : (tensor, tensor) -> 
tensor + + %c = "tf.AddV2"(%a7, %b3) : (tensor, tensor) -> tensor + + func.return %c : tensor +} + +// ----- + +// Test side-effecting ops + +// CHECK-LABEL: func private @main_stream_{{[0-9]*}} +// CHECK-SAME: ([[ARG:%.*]]: tensor, [[FUTURE:%.*]]: !mlrt.future, [[CONTROL_PROMISE:%.*]]: !mlrt.promise) +// CHECK: [[HANDLE:%.*]] = "tf_mlrt.tf_await"([[FUTURE]]) +// CHECK: "tf.AssignVariableOp"([[HANDLE]], [[ARG]]) +// CHECK-NEXT: mlrt.promise_control [[CONTROL_PROMISE]] + +// CHECK-LABEL: func private @main_stream_{{[0-9]*}} +// CHECK-SAME: ({{%.*}}: tensor, {{%.*}}: tensor, [[FUTURE:%.*]]: !mlrt.future, [[PROMISE:%.*]]: !mlrt.promise) +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: [[V:%.*]] = "tf_mlrt.tf_await"([[FUTURE]]) +// CHECK-NEXT: [[RES:%.*]] = "tf.Sub"({{%.*}}, [[V]]) +// CHECK: "tf_mlrt.tf_promise"([[PROMISE]], [[RES]]) +// CHECK: return + +// CHECK-LABEL: func private @main_stream_{{[0-9]*}} +// CHECK-SAME: ([[CONTROL_FUTURE:%.*]]: !mlrt.future, [[PROMISE:%.*]]: !mlrt.promise, [[PROMISE_HANDLE:%.*]]: !mlrt.promise) +// CHECK: [[HANDLE:%.*]] = "tf.VarHandleOp" +// CHECK-NEXT: "tf_mlrt.tf_promise"([[PROMISE_HANDLE]], [[HANDLE]]) +// CHECK: mlrt.await_control [[CONTROL_FUTURE]] +// CHECK-NEXT: [[V:%.*]] = "tf.ReadVariableOp"([[HANDLE]]) +// CHECK: "tf_mlrt.tf_promise"([[PROMISE]], [[V]]) + +// CHECK-LABEL: func @main +// CHECK: [[PROMISE:%.*]]:3, [[FUTURE:%.*]]:3 = "tf_mlrt.allocate_futures" +// CHECK: [[CONTROL_PROMISE:%.*]], [[CONTROL_FUTURE:%.*]] = "mlrt.allocate_control_futures" +// CHECK: [[ASYNC_HANDLE_0:%.*]] = mlrt.async([[CONTROL_FUTURE]], [[PROMISE]]#0, [[PROMISE]]#1) +// CHECK-SAME: callee = @main_stream_{{[0-9]*}} +// CHECK: [[ASYNC_HANDLE_1:%.*]] = mlrt.async({{%.*}}, {{%.*}}, [[FUTURE]]#0, [[PROMISE]]#2) +// CHECK-SAME: callee = @main_stream_{{[0-9]*}} +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: [[x:%.*]] = "tf.AddV2" +// CHECK: [[ASYNC_HANDLE_2:%.*]] = mlrt.async([[x]], [[FUTURE]]#1, 
[[CONTROL_PROMISE]]) +// CHECK-SAME: callee = @main_stream_{{[0-9]*}} +// CHECK: [[y:%.*]] = "tf_mlrt.tf_await"([[FUTURE]]#2) +// CHECK: [[RES:%.*]] = "tf.AddV2"([[x]], [[y]]) +// CHECK: mlrt.await_handle [[ASYNC_HANDLE_0]] +// CHECK-NEXT: mlrt.await_handle [[ASYNC_HANDLE_1]] +// CHECK-NEXT: mlrt.await_handle [[ASYNC_HANDLE_2]] +// CHECK-NEXT: return [[RES]] + +func.func @main(%a: tensor, %b: tensor) -> tensor { + %handle = "tf.VarHandleOp"() {container = "", shared_name = "var"} : () -> tensor>> + + %a0 = "tf.AddV2"(%a, %a) : (tensor, tensor) -> tensor + %a1 = "tf.AddV2"(%a0, %a) : (tensor, tensor) -> tensor + %a2 = "tf.AddV2"(%a1, %a) : (tensor, tensor) -> tensor + %a3 = "tf.AddV2"(%a2, %a) : (tensor, tensor) -> tensor + "tf.AssignVariableOp"(%handle, %a3) : (tensor>>, tensor) -> () + + %b0 = "tf.Sub"(%a, %b) : (tensor, tensor) -> tensor + %b1 = "tf.Sub"(%b0, %b) : (tensor, tensor) -> tensor + %b2 = "tf.Sub"(%b1, %b) : (tensor, tensor) -> tensor + %var = "tf.ReadVariableOp"(%handle) : (tensor>>) -> tensor + %b3 = "tf.Sub"(%b2, %var) : (tensor, tensor) -> tensor + + %c = "tf.AddV2"(%a3, %b3) : (tensor, tensor) -> tensor + + func.return %c : tensor +} + +// ----- + +// Test multiple promises and futures + +// CHECK-LABEL: func private @main_stream_1 +// CHECK: mlrt.await_control +// CHECK: "tf.DummySideEffecting"() {id = 4 +// CHECK: return + +// CHECK-LABEL: func private @main_stream_2 +// CHECK: mlrt.await_control +// CHECK: "tf.DummySideEffecting"() {id = 3 +// CHECK: mlrt.promise_control +// CHECK: return + +// CHECK-LABEL: func private @main_stream_3 +// CHECK: mlrt.await_control +// CHECK: "tf.DummySideEffecting"() {id = 2 +// CHECK: mlrt.promise_control +// CHECK: return + +// CHECK-LABEL: func private @main_stream_4 +// CHECK: "tf.DummySideEffecting"() {id = 1 +// CHECK: mlrt.promise_control +// CHECK: return + +// CHECK-LABEL: func @main() +// CHECK: [[PROMISES:%.*]]:3, [[FUTURES:%.*]]:3 = "mlrt.allocate_control_futures" +// CHECK: 
mlrt.async([[PROMISES]]#2) {callee = @main_stream_4 +// CHECK: mlrt.async([[FUTURES]]#2, [[PROMISES]]#1) {callee = @main_stream_3 +// CHECK: mlrt.async([[FUTURES]]#1, [[PROMISES]]#0) {callee = @main_stream_2 +// CHECK: mlrt.async([[FUTURES]]#0) {callee = @main_stream_1 +// CHECK: mlrt.await_handle +// CHECK: mlrt.await_handle +// CHECK: mlrt.await_handle +// CHECK: mlrt.await_handle + +func.func @main() { + "tf.DummySideEffecting"() {id = 1} : () -> () + "tf.DummySideEffecting"() {id = 2} : () -> () + "tf.DummySideEffecting"() {id = 3} : () -> () + "tf.DummySideEffecting"() {id = 4} : () -> () + func.return +} + +// ----- + +// Test correctness when there are both data and control promises in a stream function. + +// CHECK-LABEL: func private @main_stream_1 +// CHECK-SAME: ([[PROMISE:%.*]]: !mlrt.promise, [[CONTROL_PROMISE:%.*]]: !mlrt.promise) +// CHECK: tf.DummySideEffecting +// CHECK: "tf_mlrt.tf_promise"([[PROMISE]] +// CHECK: mlrt.promise_control [[CONTROL_PROMISE]] + +func.func @main() -> tensor { + %v = "tf.DummySideEffecting"() {id = 1} : () -> tensor + + %w = "tf.DummySideEffecting"() {id = 2} : () -> tensor + %r = "tf.AddV2"(%w, %v) : (tensor, tensor) -> tensor + func.return %r : tensor +} + +// ----- + +// Test inputs to the child streams are merged to the parent streams + +// CHECK-LABEL: func private @main_stream_1 +// CHECK-SAME: ([[INPUT0:%.*]]: tensor, [[INPUT1:%.*]]: tensor +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: mlrt.async({{%.*}}, [[INPUT1]] + +// CHECK-LABEL: func @main +func.func @main(%a: tensor, %b: tensor) -> tensor { + + %a0 = "tf.AddV2"(%a, %a) : (tensor, tensor) -> tensor + %a1 = "tf.AddV2"(%a0, %a) : (tensor, tensor) -> tensor + %a2 = "tf.AddV2"(%a1, %a) : (tensor, tensor) -> tensor + %a3 = "tf.AddV2"(%a2, %a) : (tensor, tensor) -> tensor + + %b0 = "tf.Sub"(%b, %b) : (tensor, tensor) -> tensor + %b1 = "tf.Sub"(%b0, %b) : (tensor, tensor) -> tensor + + %c = "tf.AddV2"(%b1, %a) : (tensor, tensor) -> tensor + + %b2 = "tf.Sub"(%b1, 
%b) : (tensor, tensor) -> tensor + %b3 = "tf.Sub"(%b2, %b) : (tensor, tensor) -> tensor + + %d = "tf.AddN"(%a3, %b3, %c) : (tensor, tensor, tensor) -> tensor + func.return %d : tensor +} + +// ----- + +// Test that constants are copied instead of using promise/await. + +// CHECK-LABEL: func private @main_stream_1 +// CHECK-SAME: ({{%.*}}: tensor, [[PROMISE:%.*]]: !mlrt.promise) +// CHECK: tf._TfrtGetResource +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: tf.Sub +// CHECK: [[RES:%.*]] = "tf.Sub" +// CHECK: "tf_mlrt.tf_promise"([[PROMISE]], [[RES]]) +// CHECK: return + +// CHECK-NOT: func private @main_stream + +// CHECK-LABEL: func @main +// CHECK: [[PROMISE:%.*]], [[FUTURE:%.*]] = "tf_mlrt.allocate_futures" +// CHECK-NEXT: [[HANDLE:%.*]] = mlrt.async({{%.*}}, [[PROMISE]]) +// CHECK-SAME: callee = @main_stream_1 +// CHECK: tf._TfrtGetResource +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: tf.AddV2 +// CHECK: [[x:%.*]] = "tf.AddV2" +// CHECK: [[y:%.*]] = "tf_mlrt.tf_await"([[FUTURE]]) +// CHECK: [[RES:%.*]] = "tf.AddV2"([[x]], [[y]]) +// CHECK: mlrt.await_handle [[HANDLE]] +// CHECK: return [[RES]] + +func.func @main(%a: tensor, %b: tensor) -> tensor { + + %c0 = "tf._TfrtGetResource"() {indices = [0], shared_name = [""], container = [""]} : () -> (tensor) + + %a0 = "tf.AddV2"(%a, %c0) : (tensor, tensor) -> tensor + %a1 = "tf.AddV2"(%a0, %c0) : (tensor, tensor) -> tensor + %a2 = "tf.AddV2"(%a1, %c0) : (tensor, tensor) -> tensor + %a3 = "tf.AddV2"(%a2, %c0) : (tensor, tensor) -> tensor + + %b0 = "tf.Sub"(%b, %c0) : (tensor, tensor) -> tensor + %b1 = "tf.Sub"(%b0, %c0) : (tensor, tensor) -> tensor + %b2 = "tf.Sub"(%b1, %c0) : (tensor, tensor) -> tensor + %b3 = "tf.Sub"(%b2, %c0) : (tensor, tensor) -> tensor + + %c = "tf.AddV2"(%a3, %b3) : (tensor, tensor) -> tensor + + func.return %c : tensor +} + +// ----- + +// Test that constants private to a stream are still handled properly when we are copying shared constants. 
+ +// CHECK-LABEL: func private @main_stream_1 +// CHECK: [[r:%.*]] = "tf._TfrtGetResource" +// CHECK-SAME: indices = [1] +// CHECK: "tf.DummySideEffecting"([[r]]) + +// CHECK-LABEL: func private @main_stream_2 +// CHECK: [[r:%.*]] = "tf._TfrtGetResource" +// CHECK-SAME: indices = [0] +// CHECK: "tf.DummySideEffecting"([[r]]) + +// CHECK-LABEL: func @main + +func.func @main(%a: tensor, %b: tensor) -> () { + + %c0 = "tf._TfrtGetResource"() {indices = [0], shared_name = [""], container = [""]} : () -> (tensor) + "tf.DummySideEffecting"(%c0) : (tensor) -> () + + %c1 = "tf._TfrtGetResource"() {indices = [1], shared_name = [""], container = [""]} : () -> (tensor) + "tf.DummySideEffecting"(%c1) : (tensor) -> () + + func.return +} + +// ----- + +// Test that streams with no args but side-effecting ops are still created properly + +// CHECK-LABEL: func private @main_stream_1() +// CHECK: [[r:%.*]] = "tf._TfrtGetResource" +// CHECK-SAME: indices = [0] +// CHECK: "tf.DummySideEffecting"([[r]]) + +// CHECK-LABEL: func @main + +func.func @main(%a: tensor, %b: tensor) -> () { + %c0 = "tf._TfrtGetResource"() {indices = [0], shared_name = [""], container = [""]} : () -> (tensor) + "tf.DummySideEffecting"(%c0) : (tensor) -> () + func.return +} + +// ----- + +// Test control deps of tf.Assert is skipped. 
+ +// CHECK-LABEL: func.func private @skip_assert_stream_3( +// CHECK-NOT: mlrt.await_control +// CHECK: tf.Assert +// CHECK-NOT: mlrt.promise_control +// CHECK: return + +// CHECK-LABEL: func.func private @skip_assert_stream_2( +// CHECK-NOT: mlrt.await_control +// CHECK: tf.Assert +// CHECK-NOT: mlrt.promise_control +// CHECK: return + +func.func @skip_assert(%key: tensor) -> (tensor, tensor) { + %error_message = "tf.Const"() {value = dense<"error"> : tensor} : () -> tensor + %default = "tf.Const"() {value = dense<-1> : tensor} : () -> tensor + %handle = "tf.HashTableV2"() {container = "", device = "/job:localhost/replica:0/task:0/device:CPU:0", key_dtype = !tf_type.string, shared_name = "hash_table", use_node_name_sharing = false, value_dtype = i64} : () -> tensor + + + %keys = "tf.Const"() {value = dense<["a", "b", "c", "d"]> : tensor<4x!tf_type.string>} : () -> tensor<4x!tf_type.string> + %values = "tf.Const"() {value = dense<[1, 2, 3, 4]> : tensor<4xi64>} : () -> tensor<4xi64> + "tf.LookupTableImportV2"(%handle, %keys, %values) {device = ""} : (tensor, tensor<4x!tf_type.string>, tensor<4xi64>) -> () + %value0 = "tf.LookupTableFindV2"(%handle, %key, %default) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor, tensor) -> tensor + %cond = "tf.Equal"(%value0, %default) {device = "/job:localhost/replica:0/task:0/device:CPU:0", incompatible_shape_error = true} : (tensor, tensor) -> tensor + "tf.Assert"(%cond, %error_message) {device = "/job:localhost/replica:0/task:0/device:CPU:0", summarize = 3 : i64} : (tensor, tensor) -> () + "tf.Assert"(%cond, %error_message) {device = "/job:localhost/replica:0/task:0/device:CPU:0", summarize = 3 : i64} : (tensor, tensor) -> () + %value1 = "tf.LookupTableFindV2"(%handle, %key, %default) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor, tensor) -> tensor + func.return %value0, %value1 : tensor, tensor +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir 
b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir new file mode 100644 index 00000000000..d5d3254901c --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir @@ -0,0 +1,419 @@ +// RUN: tf-tfrt-opt -split-input-file -tf-to-mlrt %s | FileCheck %s + +// CHECK-LABEL: @main_stream_0 +// CHECK-SAME: ([[input0:%.*]]: !tf_mlrt.tensor, [[promise_b:%.*]]: !mlrt.promise) +func.func @main_stream_0(%input0: tensor, %promise_b: !mlrt.promise) { + %const = "tf.Const"() {__op_key = 0 : i32, value = dense<1> : tensor} : () -> tensor + // CHECK: [[a:%.*]] = tf_mlrt.executeop([[input0]], + // CHECK-SAME: AddV2 + %a = "tf.AddV2"(%input0, %const) {__op_key = 1: i32}: (tensor, tensor) -> tensor + // CHECK: [[b:%.*]] = tf_mlrt.executeop([[a]]) + // CHECK-SAME: Abs + %b = "tf.Abs"(%a) {__op_key = 2 : i32}: (tensor) -> tensor + // CHECK: tf_mlrt.promise [[promise_b]], [[b]] + "tf_mlrt.tf_promise"(%promise_b, %b) : (!mlrt.promise, tensor) -> () + // CHECK: return + return +} + +// CHECK-LABEL: @main_stream_1 +// CHECK-SAME: ([[input1:%.*]]: !tf_mlrt.tensor, [[promise_c:%.*]]: !mlrt.promise, [[promise_d:%.*]]: !mlrt.promise) +func.func @main_stream_1(%input1: tensor, %promise_c: !mlrt.promise, %promise_d: !mlrt.promise) { + %const = "tf.Const"() {__op_key = 3 : i32, value = dense<1> : tensor} : () -> tensor + // CHECK: [[c:%.*]] = tf_mlrt.executeop([[input1]], + // CHECK-SAME: Sub + %c = "tf.Sub"(%input1, %const) {__op_key = 4: i32} : (tensor, tensor) -> tensor + // CHECK: tf_mlrt.promise [[promise_c]], [[c]] + "tf_mlrt.tf_promise"(%promise_c, %c) : (!mlrt.promise, tensor) -> () + // CHECK: [[d:%.*]] = tf_mlrt.executeop([[c]]) + // CHECK-SAME: Abs + %d = "tf.Abs"(%c) {__op_key = 5: i32}: (tensor) -> tensor + // CHECK: tf_mlrt.promise [[promise_d]], [[d]] + "tf_mlrt.tf_promise"(%promise_d, %d) : (!mlrt.promise, tensor) -> () + // CHECK: return + return +} + +// CHECK-LABEL: @main +// CHECK-SAME: ([[input0:%.*]]: !tf_mlrt.tensor, [[input1:%.*]]: !tf_mlrt.tensor) 
+func.func @main(%input0: tensor, %input1: tensor) -> tensor { + // CHECK: [[promises:%.*]]:3, [[futures:%.*]]:3 = "tf_mlrt.allocate_futures" + // CHECK-SAME: num_futures = 3 + %promise_b, %promise_c, %promise_d, %future_b, %future_c, %future_d = + "tf_mlrt.allocate_futures"() + {num_futures = 3 : i32, result_segment_sizes = array} : () -> + (!mlrt.promise, !mlrt.promise, !mlrt.promise, + !mlrt.future, !mlrt.future, !mlrt.future) + + // CHECK: [[handle_0:%.*]] = mlrt.async([[input0]], [[promises]]#0) + // CHECK-SAME: callee = @main_stream_0 + %handle_0 = mlrt.async(%input0, %promise_b) + {callee = @main_stream_0} : + (tensor, !mlrt.promise) -> !mlrt.async_handle + // CHECK: [[handle_1:%.*]] = mlrt.async([[input1]], [[promises]]#1, [[promises]]#2) + // CHECK-SAME: callee = @main_stream_1 + %handle_1 = mlrt.async(%input1, %promise_c, %promise_d) + {callee = @main_stream_1} : + (tensor, !mlrt.promise, !mlrt.promise) -> !mlrt.async_handle + + %const = "tf.Const"() {__op_key = 6: i32, value = dense<2> : tensor} : () -> tensor + // CHECK: [[e:%.*]] = tf_mlrt.executeop([[input1]], + // CHECK-SAME: Mul + %e = "tf.Mul"(%input1, %const) {__op_key = 7: i32} : (tensor, tensor) -> tensor + // CHECK: [[c:%.*]] = tf_mlrt.await [[futures]]#1 + %c = "tf_mlrt.tf_await"(%future_c) : (!mlrt.future) ->tensor + // CHECK: [[f:%.*]] = tf_mlrt.executeop([[e]], [[c]]) + // CHECK-SAME: Div + %f = "tf.Div"(%e, %c) {__op_key = 8: i32}: (tensor, tensor) -> tensor + + // CHECK: [[b:%.*]] = tf_mlrt.await [[futures]]#0 + %b = "tf_mlrt.tf_await"(%future_b) : (!mlrt.future) ->tensor + // CHECK: [[d:%.*]] = tf_mlrt.await [[futures]]#2 + %d = "tf_mlrt.tf_await"(%future_d) : (!mlrt.future) ->tensor + + // CHECK: [[result:%.*]] = tf_mlrt.executeop([[b]], [[d]], [[f]]) + // CHECK-SAME: AddN + %result = "tf.AddN"(%b, %d, %f) {__op_key = 9: i32}: (tensor, tensor, tensor) -> tensor + + // CHECK: mlrt.await_handle [[handle_0]] + // CHECK: mlrt.await_handle [[handle_1]] + mlrt.await_handle %handle_0 + 
mlrt.await_handle %handle_1 + + // CHECK: return [[result]] + return %result : tensor +} + +// ----- + +// Test lowering tf.If + +func.func @then(%x: tensor, %y: tensor) -> tensor { + return %x: tensor +} + +func.func @else(%x: tensor, %y: tensor) -> tensor { + return %y: tensor +} + +// CHECK-LABEL: func @main +// CHECK-SAME: ([[cond_tensor:%.*]]: !tf_mlrt.tensor, [[x:%.*]]: !tf_mlrt.tensor, [[y:%.*]]: !tf_mlrt.tensor) +// CHECK: [[cond:%.*]] = tf_mlrt.predicate [[cond_tensor]] +// CHECK: [[z:%.*]] = mlrt.cond [[cond]] @then @else([[x]], [[y]]) +// CHECK: return [[z]] +func.func @main(%cond: tensor, %x: tensor, %y: tensor) -> tensor { + %z = "tf.If"(%cond, %x, %y) {then_branch = @then, else_branch = @else, is_stateless = true} : (tensor, tensor, tensor) -> tensor + return %z: tensor +} + +// ----- + +// Test lowering AsyncOpKernel + +// CHECK-LABEL: func @main +func.func @main(%x: tensor) -> (tensor, tensor, tensor) { + // CHECK: [[y_future:%.*]] = tf_mlrt.async_executeop + %y = "tf.TestAsyncIdentity"(%x) {__op_key = 0: i32, T = i32} : (tensor) -> tensor + // CHECK: [[z:%.*]] = tf_mlrt.executeop + %z = "tf.Identity"(%x) {__op_key = 1: i32}: (tensor) -> tensor + // CHECK: [[y:%.*]] = tf_mlrt.await [[y_future]] + // CHECK-NEXT: tf_mlrt.executeop([[y]] + %w = "tf.AddV2"(%y, %z) {__op_key = 2: i32}: (tensor, tensor) -> tensor + // CHECK-NEXT: tf_mlrt.executeop([[y]] + %u = "tf.AddV2"(%y, %z) {__op_key = 3: i32} : (tensor, tensor) -> tensor + // CHECK-NEXT: tf_mlrt.executeop([[y]] + %v = "tf.AddV2"(%y, %z) {__op_key = 4: i32}: (tensor, tensor) -> tensor + return %w, %u, %v : tensor, tensor, tensor +} + +// ----- + +// Test lowering BatchFunction op. 
+ +func.func @batched_function(%x: tensor) -> tensor { + return %x : tensor +} + +// CHECK-LABEL: func @main +func.func @main(%x: tensor<1xi32>) -> (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) { + // CHECK: [[y_future:%.*]] = tf_mlrt.batch_function + // CHECK-SAME: f = @batched_function + // CHECK-SAME: \22batch_function\22 + %y = "tf.BatchFunction"(%x) { + allowed_batch_sizes = [6], batch_timeout_micros = 100000 : i64, + batching_queue = "", container = "", device = "/device:CPU:0", + enable_large_batch_splitting = false, f = @batched_function, + max_batch_size = 6 : i64, max_enqueued_batches = 10 : i64, + num_batch_threads = 1 : i64, operand_segment_sizes = array, + shared_name = "batch_function" + } : (tensor<1xi32>) -> tensor<1xi32> + + // CHECK: [[z:%.*]] = tf_mlrt.executeop + %z = "tf.Identity"(%x) {__op_key = 0: i32} : (tensor<1xi32>) -> tensor<1xi32> + // CHECK: [[y:%.*]] = tf_mlrt.await [[y_future]] + // CHECK-NEXT: tf_mlrt.executeop([[y]] + %w = "tf.AddV2"(%y, %z) {__op_key = 1: i32}: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NEXT: tf_mlrt.executeop([[y]] + %u = "tf.AddV2"(%y, %z) {__op_key = 2: i32}: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + // CHECK-NEXT: tf_mlrt.executeop([[y]] + %v = "tf.AddV2"(%y, %z) {__op_key = 3: i32}: (tensor<1xi32>, tensor<1xi32>) -> tensor<1xi32> + return %w, %u, %v : tensor<1xi32>, tensor<1xi32>, tensor<1xi32> +} + +// ----- + +// Test node names are preserved. 
+ +// CHECK-LABEL: func @main +func.func @main(%x: tensor) -> tensor { + // CHECK: tf_mlrt.executeop + // CHECK-SAME: name: \22name_loc/AddV2_0\22 + %y = "tf.AddV2"(%x, %x) {__op_key = 0: i32} : (tensor, tensor) -> tensor loc("name_loc:AddV2") + // CHECK: tf_mlrt.executeop + // CHECK-SAME: name: \22fused_loc/AddV2_1\22 + %z = "tf.AddV2"(%y, %x) {__op_key = 1: i32}: (tensor, tensor) -> tensor loc(fused["fused_loc:", "AddV2"]) + // CHECK: tf_mlrt.executeop + // CHECK-SAME: name: \22AddV2_2\22 + %w = "tf.AddV2"(%z, %x) {__op_key = 2: i32}: (tensor, tensor) -> tensor + return %z : tensor +} + +// ----- + +// Test function name canonicalization + +// CHECK-LABEL: func @__inference_pruned_35 +func.func @__inference_pruned_35() -> tensor attributes {tf.entry_function = {control_outputs = "", inputs = "", outputs = "flatmapdataset__4_RetVal"}} { + %0 = "tf.Const"() {__op_key = 0: i32, device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %1 = "tf.Const"() {__op_key = 1: i32, device = "/device:CPU:0", value = dense<5> : tensor} : () -> tensor + %2 = "tf.Const"() {__op_key = 2: i32, device = "/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %3 = "tf.RangeDataset"(%0, %1, %2) {__op_key = 3: i32, device = "/device:CPU:0", output_shapes = [#tf_type.shape<>], output_types = [i64], metadata = ""} : (tensor, tensor, tensor) -> tensor + // CHECK: tf_mlrt.executeop{{.*}}op: \22FlatMapDataset\22 + // CHECK-SAME: \22__inference_Dataset_flat_map_lambda_19\22 + %4 = "tf.FlatMapDataset"(%3) {__op_key = 4: i32, Targuments = [], device = "/device:CPU:0", f = @__inference_Dataset_flat_map_lambda_190, output_shapes = [#tf_type.shape<>], output_types = [i64], metadata = ""} : (tensor) -> tensor + func.return %4 : tensor +} +// CHECK-LABEL: __inference_Dataset_flat_map_lambda_190 +func.func private @__inference_Dataset_flat_map_lambda_190(%arg0: tensor {tf._user_specified_name = "args_0"}) -> tensor attributes {tf._original_func_name = 
"__inference_Dataset_flat_map_lambda_19", tf._tf_data_function = true, tf.signature.is_stateful} { + %0 = "tf.Const"() {__op_key = 5: i32, device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %1 = "tf.Const"() {__op_key = 6: i32,device = "/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %2 = "tf.Const"() {__op_key = 7: i32,device = "/device:CPU:0", value = dense<5> : tensor} : () -> tensor + %3 = "tf.RangeDataset"(%0, %2, %1) {__op_key = 8: i32, device = "/device:CPU:0", output_shapes = [#tf_type.shape<>], output_types = [i64], metadata = ""} : (tensor, tensor, tensor) -> tensor + // CHECK: tf_mlrt.executeop{{.*}}op: \22MapDataset\22 + // CHECK-SAME: \22__inference_Dataset_map_lambda_16\22 + %4 = "tf.MapDataset"(%3) {__op_key = 9: i32, device = "/device:CPU:0", f = @__inference_Dataset_map_lambda_160, f._tf_data_function = true, output_shapes = [#tf_type.shape<>], output_types = [i64], preserve_cardinality = true, use_inter_op_parallelism = true, metadata = ""} : (tensor) -> tensor + %5 = "tf.Identity"(%4) {__op_key = 10: i32, device = "/device:CPU:0"} : (tensor) -> tensor + func.return %5 : tensor +} +// CHECK-LABEL: __inference_Dataset_map_lambda_160 +func.func private @__inference_Dataset_map_lambda_160(%arg0: tensor {tf._user_specified_name = "args_0"}) -> tensor attributes {tf._tf_data_function = true} { + %0 = "tf.Const"() {__op_key = 11: i32, device = "/device:CPU:0", value = dense<2> : tensor} : () -> tensor + %1 = "tf.Mul"(%arg0, %0) {__op_key = 12: i32, device = "/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.Identity"(%1) {__op_key = 13: i32, device = "/device:CPU:0"} : (tensor) -> tensor + func.return %2 : tensor +} + +// ----- + +// Test while conversion + +// CHECK-LABEL: func @while_cond_lt9 +// CHECK-SAME: ([[arg0:%.*]]: !tf_mlrt.tensor) -> !tf_mlrt.tensor +func.func @while_cond_lt9(%arg0: tensor) -> tensor { + %0 = "tf.Const"() {__op_key = 0: i32, device = "/device:CPU:0", value = dense<9> : tensor} : () -> 
tensor + %1 = "tf.Less"(%arg0, %0) {__op_key = 1: i32, device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %1 : tensor +} + +// CHECK-LABEL: func @while_body_add2 +// CHECK-SAME: ([[arg0:%.*]]: !tf_mlrt.tensor) -> !tf_mlrt.tensor +func.func @while_body_add2(%arg0: tensor) -> tensor { + %0 = "tf.Const"() {__op_key = 2: i32, device = "/device:CPU:0", value = dense<2> : tensor} : () -> tensor + %1 = "tf.Add"(%arg0, %0) {__op_key = 3: i32, device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %1 : tensor +} + +// CHECK-LABEL: func @while_test() +// CHECK-SAME: -> !tf_mlrt.tensor +func.func @while_test() -> (tensor) { + // CHECK: [[CONST:%.*]] = tf_mlrt.executeop + %0 = "tf.Const"() {__op_key = 4: i32, device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor + // CHECK: [[pred_res:%.*]] = call @"while_cond_lt9/tf_mlrt_predicate"([[CONST]]) : (!tf_mlrt.tensor) -> i1 + // CHECK: [[while_res:%.*]]:2 = mlrt.while + // CHECK-SAME: @"while_body_add2/tf_mlrt_body"([[CONST]]) + // CHECK-SAME: (!tf_mlrt.tensor) -> (!tf_mlrt.tensor, i1) + %1 = "tf.While"(%0) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) + // CHECK: return [[while_res]]#0 : !tf_mlrt.tensor + func.return %1 : tensor +} +// CHECK: func @"while_body_add2/tf_mlrt_body"([[arg:%.*]]: !tf_mlrt.tensor) -> (!tf_mlrt.tensor, i1) +// CHECK: [[body_res:%.*]] = call @while_body_add2([[arg]]) : (!tf_mlrt.tensor) -> !tf_mlrt.tensor +// CHECK: [[pred_res:%.*]] = call @"while_cond_lt9/tf_mlrt_predicate"([[body_res]]) : (!tf_mlrt.tensor) -> i1 +// CHECK: return [[body_res]], [[pred_res]] : !tf_mlrt.tensor, i1 + +// CHECK: func @"while_cond_lt9/tf_mlrt_predicate"([[arg:%.*]]: !tf_mlrt.tensor) -> i1 +// CHECK: [[cond_res:%.*]] = call @while_cond_lt9([[arg]]) : (!tf_mlrt.tensor) -> !tf_mlrt.tensor +// CHECK: [[bool_res:%.*]] = tf_mlrt.predicate [[cond_res]] +// CHECK: return [[bool_res]] : i1 + +// CHECK-LABEL: func 
@multi_while_test +func.func @multi_while_test() -> (tensor, tensor) { + %0 = "tf.Const"() {__op_key = 5: i32, device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %1 = "tf.Const"() {__op_key = 6: i32, device = "/device:CPU:0", value = dense<1> : tensor} : () -> tensor + // CHECK: [[pred_0:%.*]] = call @"while_cond_lt9/tf_mlrt_predicate" + // CHECK: mlrt.while [[pred_0]] @"while_body_add2/tf_mlrt_body" + // CHECK: [[pred_1:%.*]] = call @"while_cond_lt9/tf_mlrt_predicate" + // CHECK: mlrt.while [[pred_1]] @"while_body_add2/tf_mlrt_body" + %2 = "tf.While"(%0) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) + %3 = "tf.While"(%1) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) + func.return %2, %3 : tensor, tensor +} + +// ----- + +// Test async output to function is converted + +// CHECK-LABEL: @serving_default_stream_1 +// CHECK-SAME: !mlrt.future +func.func private @serving_default_stream_1(%arg0: tensor) { + // CHECK: [[tensor:%.*]] = tf_mlrt.await + // CHECK: tf_mlrt.executeop([[tensor]]) + %0 = "tf.StringFormat"(%arg0) {__op_key = 0: i32, device = "/job:localhost/replica:0/task:0/device:CPU:0", placeholder = "{}", strtemplate = "%s", summarize = 3 : i64, template = "Outside compiled {}"} : (tensor) -> tensor + "tf.PrintV2"(%0) {__op_key = 1: i32, device = "/job:localhost/replica:0/task:0/device:CPU:0", end = "\0A", output_stream = "stderr"} : (tensor) -> () + return +} + +func.func @callee(%arg: tensor) -> (tensor) { + func.return %arg: tensor +} + +// CHECK-LABEL: @executeop_input +func.func @executeop_input(%arg0: tensor) -> (tensor) { + // CHECK: [[async_out:%.*]] = tf_mlrt.batch_function + %2 = "tf.BatchFunction"(%arg0) {device = "/device:CPU:0", allowed_batch_sizes = [64], batch_timeout_micros = 1 : i64, batching_queue = "", container = "", f = @callee, max_batch_size = 256 : i64, num_batch_threads 
= 2 : i64, operand_segment_sizes = array, shared_name = ""} : (tensor) -> tensor + // CHECK-NEXT: mlrt.async([[async_out]]) {{.*}} : (!mlrt.future) + %3 = mlrt.async(%2) {callee = @serving_default_stream_1} : (tensor) -> !mlrt.async_handle + // CHECK: mlrt.await_handle + mlrt.await_handle %3 + // CHECK: return + // CHECK-SAME: !tf_mlrt.tensor + func.return %2 : tensor +} + +// ----- + +// Support pre-assigned op_key + +// CHECK-LABEL: @main +// CHECK-SAME: ([[input0:%.*]]: !tf_mlrt.tensor, [[promise_b:%.*]]: !mlrt.promise) +func.func @main(%input0: tensor, %promise_b: !mlrt.promise) { + %const = "tf.Const"() {__op_key = 0 : i32, value = dense<1> : tensor} : () -> tensor + // CHECK: [[a:%.*]] = tf_mlrt.executeop([[input0]], + // CHECK-SAME: AddV2 + // CHECK-SAME: op_key = 1 + // CHECK-NOT: __op_key + %a = "tf.AddV2"(%input0, %const) {__op_key = 1: i32}: (tensor, tensor) -> tensor + // CHECK: [[b:%.*]] = tf_mlrt.executeop([[a]]) + // CHECK-SAME: Abs + // CHECK-SAME: op_key = 2 + // CHECK-NOT: __op_key + %b = "tf.Abs"(%a) {__op_key = 2: i32 }: (tensor) -> tensor + // CHECK: tf_mlrt.promise [[promise_b]], [[b]] + "tf_mlrt.tf_promise"(%promise_b, %b) : (!mlrt.promise, tensor) -> () + // CHECK: return + return +} + +// ----- + +// Test future as input to promise + +// CHECK-LABEL: func @main_stream_0 +func.func @main_stream_0(%x: tensor, %p: !mlrt.promise) -> () { + // CHECK: [[y_future:%.*]] = tf_mlrt.async_executeop + %y = "tf.TestAsyncIdentity"(%x) {__op_key = 0: i32, T = i32} : (tensor) -> tensor + // CHECK: tf_mlrt.promise_future + // CHECK-SAME: [[y_future]] + "tf_mlrt.tf_promise"(%p, %y): (!mlrt.promise, tensor) -> () + return +} + +// CHECK-LABEL: @main +// CHECK-SAME: ([[input0:%.*]]: !tf_mlrt.tensor) +func.func @main(%input0: tensor) -> tensor { + // CHECK: [[promises:%.*]], [[futures:%.*]] = "tf_mlrt.allocate_futures" + // CHECK-SAME: num_futures = 1 + %promise_b, %future_b = "tf_mlrt.allocate_futures"() + {num_futures = 1 : i32, result_segment_sizes = array} 
: () -> + (!mlrt.promise, !mlrt.future) + + // CHECK: [[handle_0:%.*]] = mlrt.async([[input0]], [[promises]]) + // CHECK-SAME: callee = @main_stream_0 + %handle_0 = mlrt.async(%input0, %promise_b) + {callee = @main_stream_0} : + (tensor, !mlrt.promise) -> !mlrt.async_handle + + // CHECK: [[const:%.*]] = tf_mlrt.executeop + // CHECK-SAME: Const + %const = "tf.Const"() {__op_key = 1: i32, value = dense<2> : tensor} : () -> tensor + + // CHECK: [[b:%.*]] = tf_mlrt.await [[futures]] + %b = "tf_mlrt.tf_await"(%future_b) : (!mlrt.future) ->tensor + + // CHECK: [[result:%.*]] = tf_mlrt.executeop([[b]], [[const]]) + // CHECK-SAME: AddV2 + %result = "tf.AddV2"(%b, %const) {__op_key = 2: i32}: (tensor, tensor) -> tensor + + // CHECK: mlrt.await_handle [[handle_0]] + mlrt.await_handle %handle_0 + + // CHECK: return [[result]] + return %result : tensor +} + +// ----- + +// Test lowering of tf call ops + +// CHECK-LABEL: @callee +func.func @callee(%arg0: tensor) -> (tensor) { + func.return %arg0: tensor +} + +// CHECK-LABEL: func @call_test +func.func @call_test(%arg0: tensor) -> (tensor, tensor, tensor) { + %0 = "tf.Add"(%arg0, %arg0) {__op_key = 0, device = "/device:CPU:0"} : (tensor, tensor) -> tensor + // CHECK: [[results_0:%.*]] = call @callee( + // CHECK-SAME: (!tf_mlrt.tensor) -> !tf_mlrt.tensor + %1 = "tf.StatefulPartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor) -> (tensor) + // CHECK-NEXT: [[results_1:%.*]] = call @callee( + // CHECK-SAME: (!tf_mlrt.tensor) -> !tf_mlrt.tensor + %2 = "tf.PartitionedCall"(%0) {config = "", config_proto = "", executor_type = "", f = @callee} : (tensor) -> (tensor) + // CHECK-NEXT: [[results_2:%.*]] = call @callee( + // CHECK-SAME: (!tf_mlrt.tensor) -> !tf_mlrt.tensor + %3 = "tf.LegacyCall"(%0) {f = @callee} : (tensor) -> (tensor) + // CHECK: [[results_0]], [[results_1]], [[results_2]] + func.return %1, %2, %3 : tensor, tensor, tensor +} + +// CHECK-LABEL: @branch0 +func.func @branch0(%arg0: 
tensor, %arg1: tensor) -> tensor { + %0 = "tf.Add" (%arg0, %arg1) {__op_key = 1, device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %0 : tensor +} + +// CHECK-LABEL: @branch1 +func.func @branch1(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = "tf.Add" (%arg0, %arg1) {__op_key = 2, device = "/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.Add" (%arg0, %0) {__op_key = 3, device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %1 : tensor +} + +// CHECK-LABEL: func @case_test +// CHECK-SAME: ([[tf_idx:%.*]]: !tf_mlrt.tensor, [[branch_arg0:%.*]]: !tf_mlrt.tensor, [[branch_arg1:%.*]]: !tf_mlrt.tensor) +func.func @case_test(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + // CHECK: [[idx:%.*]] = tf_mlrt.tensor_to_int32 [[tf_idx]] + // CHECK-NEXT: [[out:%.*]] = mlrt.case [[idx]] [@branch0, @branch1]([[branch_arg0]], [[branch_arg1]]) + %0 = "tf.Case"(%arg0, %arg1, %arg2) {_lower_using_switch_merge = true, branches = [@branch0, @branch1], is_stateless = true} : (tensor, tensor, tensor) -> tensor + func.return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tpu_conversions.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tpu_conversions.mlir new file mode 100644 index 00000000000..87f906bcbe1 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tpu_conversions.mlir @@ -0,0 +1,168 @@ +// RUN: tf-tfrt-opt --split-input-file -pass-pipeline='builtin.module(pre-parallel-tf-to-mlrt{use-tpu-host-allocator-for-inputs=true},tf-mlrt-parallelization{tfrt-cost-threshold=4},tf-to-mlrt)' %s | FileCheck %s --dump-input=fail --dump-input-filter=all + +func.func @callee(%arg0: tensor, %arg1: tensor) -> (tensor) { + func.return %arg0: tensor +} + +// CHECK-LABEL: func @batch_function +func.func @batch_function(%arg0: tensor) -> (tensor) { + // CHECK: [[batch_result_future:%.*]] = tf_mlrt.batch_function + // CHECK: [[batch_result:%.*]] = tf_mlrt.await [[batch_result_future]] + // CHECK-NEXT: 
[[rendezvous_key_base:%.*]] = tf_mlrt_tpu.compile_and_execute([[batch_result]]) + // CHECK-NEXT: return [[rendezvous_key_base]] + %0 = "tf.BatchFunction"(%arg0, %arg0) {device = "/device:CPU:0", allowed_batch_sizes = [64], batch_timeout_micros = 1 : i64, batching_queue = "", container = "", f = @callee, max_batch_size = 256 : i64, num_batch_threads = 2 : i64, operand_segment_sizes = array, shared_name = ""} : (tensor, tensor) -> tensor + %1 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor) -> tensor + func.return %1 : tensor +} + +// ----- + +func.func @executeop_input(%arg0: tensor) -> (tensor, tensor) { + // CHECK-NOT: tf_mlrt.executeop( + // CHECK: [[device:%.*]] = tf_mlrt_tpu.get_tpu_host_device + // CHECK: [[cast:%.*]] = tf_mlrt.executeop.device([[device]]){{.*}}op: \22Cast\22 + // CHECK: [[rendezvous_key_base:%.*]], [[result_future:%.*]] = tf_mlrt_tpu.compile_and_execute([[cast]]) + // CHECK: tf_mlrt.await [[result_future]] + %0 = "tf.Cast"(%arg0) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor) -> tensor + %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor) -> (tensor, tensor) + func.return %1, %2 : tensor, tensor +} + +// ----- + +func.func @executeop_side_effecting_input(%arg0: tensor>>, %indices: tensor) -> (tensor) { + // CHECK-NOT: tf_mlrt.executeop( + // CHECK: [[device:%.*]] = tf_mlrt_tpu.get_tpu_host_device + // CHECK: [[var:%.*]] = tf_mlrt.executeop.device([[device]]){{.*}}op: \22ResourceGather\22 + // CHECK: [[rendezvous_key_base:%.*]] = tf_mlrt_tpu.compile_and_execute([[var]]) + %0 = "tf.ResourceGather"(%arg0, %indices) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor>>, tensor) -> tensor + %1 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = 
array, producer_name = "producer_name"} : (tensor) -> tensor + func.return %1 : tensor +} + +// ----- + +func.func @executeop_input_same_execute_op(%arg0: tensor, %arg1: tensor<2xf32>) -> (tensor) { + // CHECK-NOT: tf_mlrt.executeop( + // CHECK: [[device:%.*]] = tf_mlrt_tpu.get_tpu_host_device + // CHECK: [[split:%.*]]:2 = tf_mlrt.executeop.device([[device]]) + // CHECK: tf_mlrt_tpu.compile_and_execute([[split]]#0, [[split]]#1) + %0, %1 = "tf.Split"(%arg0, %arg1) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor, tensor<2xf32>) -> (tensor, tensor) + %2 = "tf.TPUCompileMlirAndExecute"(%0, %1) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor, tensor) -> tensor + func.return %2 : tensor +} + +// ----- + +// Test that inputs are lowered correctly when they form a DAG. + +// CHECK-LABEL: executeop_dag +func.func @executeop_dag(%arg0: tensor) -> (tensor) { + // CHECK-NEXT: tf_mlrt_tpu.get_tpu_host_device + // CHECK-NEXT: tf_mlrt.executeop.device{{.*}}op: \22Cast\22 + // CHECK-NEXT: tf_mlrt_tpu.get_tpu_host_device + // CHECK-NEXT: tf_mlrt.executeop.device{{.*}}op: \22Relu\22 + // CHECK-NEXT: tf_mlrt_tpu.compile_and_execute + %0 = "tf.Cast"(%arg0) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor) -> tensor + %1 = "tf.Relu"(%0) {__op_key = 1: i32, device = "/device:CPU:0"} : (tensor) -> (tensor) + %2 = "tf.TPUCompileMlirAndExecute"(%1, %0) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor, tensor) -> tensor + func.return %2 : tensor +} + +// ----- + +func.func @test_fuse_dynamic_dimension_ops(%arg0: tensor<*xi32>, %arg1: tensor<*x!tf_type.resource>, %arg2: tensor<*xi32>, %arg3: tensor<*xi32>, %arg4: tensor<*xi32>, %arg5: tensor, %arg6: tensor, %arg7: tensor) -> tensor<*xi32> { + %0 = "tf.ReadVariableOp"(%arg1) {__op_key = 0: i32, device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor<*xi32> + %1 = 
"tf.Shape"(%arg0) {__op_key = 1: i32, device = "/CPU:0"} : (tensor<*xi32>) -> tensor + %2 = "tf.Shape"(%0) {__op_key = 2: i32, device = "/CPU:0"} : (tensor<*xi32>) -> tensor + // CHECK: [[rendezvous_key_base:%.*]], [[result_future:%.*]] = tf_mlrt_tpu.compile_and_execute + // CHECK-SAME: constant_operand_indices = array + // CHECK-SAME: num_operands = 4 + // CHECK-SAME: operands_with_static_shape = array + %rendezvous_key_base, %results = "tf.TPUCompileMlirAndExecute"(%arg0, %2, %0, %1, %arg5, %arg6, %arg7) {operands_with_static_shape = [0 : i32, 1 : i32, 3 : i32], metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor<*xi32>, tensor, tensor<*xi32>, tensor, tensor, tensor, tensor) -> (tensor<3x!tf_type.string>, tensor<*xi32>) + func.return %results : tensor<*xi32> +} + +// ----- + +// Test async output of tf.TPUCompileMlirAndExecute to function is converted + +// CHECK-LABEL: @executeop_input_stream_1 +// CHECK-SAME: ([[future:%.*]]: !mlrt.future +// CHECK: [[tensor:%.*]] = tf_mlrt.await [[future]] +// CHECK: tf_mlrt.executeop([[tensor]]) +// CHECK-SAME: StringFormat + +// CHECK-LABEL: @executeop_input +func.func @executeop_input(%arg0: tensor) -> (tensor) { + // CHECK: tf_mlrt.executeop + %0 = "tf.Cast"(%arg0) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor) -> tensor + // CHECK: [[rendezvous_key_base:%.*]], [[result:%.*]] = tf_mlrt_tpu.compile_and_execute + %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor) -> (tensor, tensor) + %3 = "tf.StringFormat"(%2) {__op_key = 1: i32, device = "/job:localhost/replica:0/task:0/device:CPU:0", placeholder = "{}", strtemplate = "%s", summarize = 3 : i64, template = "Outside compiled {}"} : (tensor) -> tensor + "tf.PrintV2"(%3) {__op_key = 2: i32, device = "/job:localhost/replica:0/task:0/device:CPU:0", end = "\0A", output_stream = 
"stderr"} : (tensor) -> () + // CHECK: [[handle:%.*]] = mlrt.async([[result]]) + // CHECK-SAME: (!mlrt.future) + // CHECK: mlrt.await_handle [[handle]] + // CHECK: return [[rendezvous_key_base]] + // CHECK-SAME: !tf_mlrt.tensor + func.return %1 : tensor +} + +// ----- + +// Test constant arguments to tf.TPUCompileMlirAndExecute are preserved during parallelization. + +// CHECK-LABEL: @preserve_constant_args( +func.func @preserve_constant_args(%arg0: tensor, %arg1: tensor<*x!tf_type.resource>, %arg2: tensor<*x!tf_type.resource>, %arg3: tensor<*x!tf_type.resource>) -> (tensor) { + // CHECK-NOT: ReadVariableOp + // CHECK: mlrt.async( + %v0 = "tf.ReadVariableOp"(%arg1) {__op_key = 0: i32, device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor + %v1 = "tf.ReadVariableOp"(%arg2) {__op_key = 1: i32, device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor + // CHECK: [[cast:%.*]] = tf_mlrt.executeop( + // CHECK-SAME: ReadVariableOp + %v2 = "tf.ReadVariableOp"(%arg3) {__op_key = 2: i32, device = "/CPU:0"} : (tensor<*x!tf_type.resource>) -> tensor + // CHECK: [[cast:%.*]] = tf_mlrt.executeop.device + // CHECK-SAME: Cast + %0 = "tf.Cast"(%arg0) {__op_key = 3: i32, device = "/device:CPU:0"} : (tensor) -> tensor + // CHECK: tf_mlrt_tpu.compile_and_execute({{%.*}}, [[cast]] + // CHECK-SAME: constant_operand_indices = array + %1, %2 = "tf.TPUCompileMlirAndExecute"(%0, %v1, %0, %v2, %v0, %arg0) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor, tensor, tensor, tensor, tensor, tensor) -> (tensor, tensor) + func.return %2 : tensor +} + +// ----- + +func.func @executeop_input_async() -> (tensor, tensor) { + // CHECK-NOT: tf_mlrt.executeop( + // CHECK: [[device:%.*]] = tf_mlrt_tpu.get_tpu_host_device + // CHECK: [[recv_future:%.*]] = tf_mlrt.async_executeop.device([[device]]){{.*}}op: \22Recv\22 + // CHECK: [[recv:%.*]] = tf_mlrt.await [[recv_future]] + // CHECK: [[rendezvous_key_base:%.*]], 
[[result_future:%.*]] = tf_mlrt_tpu.compile_and_execute([[recv]]) + // CHECK: tf_mlrt.await [[result_future]] + %0 = "tf.Recv"() {__op_key = 0: i32, device = "/device:CPU:0", tensor_name = "tensor", send_device = "/device:CPU:0", send_device_incarnation = 0, recv_device = "/device:CPU:0"} : () -> tensor + %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor) -> (tensor, tensor) + func.return %1, %2 : tensor, tensor +} + +// ----- + +// Test the output from TPU op is properly awaited before its use by map_fn. +// CHECK-LABEL: @main +// CHECK-SAME: ([[input0:%.*]]: !tf_mlrt.tensor, [[input1:%.*]]: !tf_mlrt.tensor) +func.func @main(%input0: tensor, %input1: tensor, %input2: tensor>> ) -> tensor { + %0 = "tf.Cast"(%input0) {__op_key = 0: i32, device = "/device:CPU:0"} : (tensor) -> tensor + // CHECK: tf_mlrt_tpu.compile_and_execute + %1, %2 = "tf.TPUCompileMlirAndExecute"(%0) {metadata = "metadata", mlir_module = "mlir_module", operand_segment_sizes = array, producer_name = "producer_name"} : (tensor) -> (tensor, tensor) + %max_iter = "tf.Const"() {__op_key = 1, value = dense<2> : tensor} : () -> tensor + // CHECK: tf_mlrt.map_fn + %result = "tf_mlrt.tf_map_fn"(%max_iter, %input2, %2) { operand_segment_sizes = array, body_fn = @NopMapFnBody, num_tensor_list_or_flow_in = 1 : i32} : (tensor, tensor>>, tensor) -> tensor + return %result : tensor +} + +// CHECK-LABEL: @NopMapFnBody +func.func private @NopMapFnBody(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor>>) -> () { + %const = "tf.Const"() {__op_key = 2 : i32, value = dense<1> : tensor} : () -> tensor + %a = "tf.AddV2"(%arg2, %const) {__op_key = 3: i32}: (tensor, tensor) -> tensor + return +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir new file mode 100644 index 00000000000..27c92289a5b --- 
/dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/while_to_map_fn.mlir @@ -0,0 +1,640 @@ +// RUN: tf-tfrt-opt -split-input-file -tf-mlrt-while-to-map-fn %s | FileCheck %s + +// Test a while to map_fn conversion in which the max iteration is hard coded inside the predicate body. + +// CHECK-LABEL: map/while_cond +func.func private @"map/while_cond"(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor) -> tensor { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<3> : tensor} : () -> tensor + %0 = "tf.Less"(%arg0, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.Less"(%arg1, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.LogicalAnd"(%0, %1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: map/while_body +func.func private @"map/while_body"(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor) -> (tensor, tensor, tensor>>, tensor) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00, 9.000000e+00]> : tensor<9xf32>} : () -> tensor<9xf32> + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[0, 1, 2]> : tensor<3xi32>} : () -> tensor<3xi32> + %cst_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<3> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_3 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00, 7.000000e+00, 8.000000e+00]> : 
tensor<9xf32>} : () -> tensor<9xf32> + %cst_4 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %0 = "tf.AddV2"(%arg0, %cst_4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.Mul"(%arg3, %cst_3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<9xf32>) -> tensor<9xf32> + %2 = "tf.Reshape"(%1, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<9xf32>, tensor<2xi32>) -> tensor<3x3xf32> + %3 = "tf.AddV2"(%arg1, %cst_4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %4 = "tf.GatherV2"(%cst_1, %arg1, %cst_0) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<3xi32>, tensor, tensor) -> tensor + %5 = "tf.Cast"(%4) {Truncate = false, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor + %6 = "tf.Mul"(%5, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<9xf32>) -> tensor<9xf32> + %7 = "tf.Reshape"(%6, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<9xf32>, tensor<2xi32>) -> tensor<3x3xf32> + %8 = "tf.MatMul"(%2, %7) {device = "/job:localhost/replica:0/task:0/device:CPU:0", transpose_a = false, transpose_b = false} : (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + %9 = "tf.MatrixDeterminant"(%8) {T = f32, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<3x3xf32>) -> tensor + %10 = "tf.TensorListSetItem"(%arg2, %arg1, %9) {device = "/job:localhost/replica:0/task:0/device:CPU:0", resize_if_index_out_of_bounds = false} : (tensor>>, tensor, tensor) -> tensor>> + return %0, %3, %10, %arg3 : tensor, tensor, tensor>>, tensor +} + +// CHECK-LABEL: map/while_body/MapFnBody +// CHECK-SAME: (%arg0: !mlrt.future, %arg1: !mlrt.promise, %arg2: tensor, %arg3: tensor, %arg4: tensor) +// CHECK: [[det:%.*]] = "tf.MatrixDeterminant" +// 
CHECK-NEXT: [[ta_0:%.*]] = "tf_mlrt.tf_await"(%arg0) : (!mlrt.future) -> tensor>> +// CHECK-NEXT: [[ta_1:%.*]] = "tf.TensorListSetItem"([[ta_0]], %arg3, [[det]]) { +// CHECK-NEXT: "tf_mlrt.tf_promise"(%arg1, [[ta_1]]) : (!mlrt.promise, tensor>>) -> () +// CHECK-NEXT: return + +//CHECK-LABEL: @serving_default +func.func @serving_default(%arg0: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> tensor<3xf32> attributes {tf.entry_function = {control_outputs = "", inputs = "serving_default_input:0", outputs = "PartitionedCall:0"}} { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<-1> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<3> : tensor} : () -> tensor + // CHECK: [[tensor_list:%.*]] = "tf.TensorListReserve"([[shape:%.*]], [[reserve_size:%.*]]) { + %0 = "tf.TensorListReserve"(%cst_1, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor>> + // CHECK: [[map_fn_result:%.*]] = tf_mlrt.tf_map_fn([[reserve_size]], [[tensor_list]], %arg0) + // CHECK-SAME: {body_fn = @"map/while_body/MapFnBody", num_tensor_list_or_flow_in = 1 : i32} + // CHECK-NOT: tf.While + %1:4 = "tf.While"(%cst, %cst, %0, %arg0) {_lower_using_switch_merge = true, _num_original_outputs = 6 : i64, _read_only_resource_inputs = [], _xla_propagate_compile_time_consts = true, body = @"map/while_body", cond = @"map/while_cond", device = "/job:localhost/replica:0/task:0/device:CPU:0", is_stateless = true, parallel_iterations = 4 : i64, shape_invariant} : (tensor, tensor, tensor>>, tensor) -> (tensor, tensor, tensor>>, tensor) + // CHECK-NEXT: 
"tf.TensorListStack"([[map_fn_result]], %cst_0) { + %2 = "tf.TensorListStack"(%1#2, %cst_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0", num_elements = 3 : i64} : (tensor>>, tensor<0xi32>) -> tensor<3xf32> + return %2 : tensor<3xf32> +} + +// ----- + +// Test a while to map_fn conversion in which max_iterations are passed +// into the predicate function. + +// CHECK-LABEL: @"map/while_cond" +func.func private @"map/while_cond"(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor, %arg4: tensor>>, %arg5: tensor, %arg6: tensor) -> tensor { + %outputs = "tf.Less"(%arg0, %arg3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %outputs_0 = "tf.Less"(%arg1, %arg3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %outputs_2 = "tf.LogicalAnd"(%outputs_0, %outputs) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + + return %outputs_2 : tensor +} + +// CHECK-LABEL: @"map/while_body" +func.func private @"map/while_body"(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor, %arg4: tensor>>, %arg5: tensor, %arg6: tensor) -> (tensor, tensor, tensor>>, tensor, tensor>>, tensor, tensor) { + %outputs = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %outputs_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %outputs_2 = "tf.AddV2"(%arg0, %outputs_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %outputs_4 = "tf.ReadVariableOp"(%arg4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor>>) -> tensor<3x1xf32> + %outputs_6 = "tf.Identity"(%outputs_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor + %outputs_8 = "tf.MatMul"(%arg5, %outputs_4) {device = "/job:localhost/replica:0/task:0/device:CPU:0", transpose_a = 
false, transpose_b = false} : (tensor, tensor<3x1xf32>) -> tensor + %outputs_10 = "tf.AddV2"(%arg1, %outputs_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %outputs_12 = "tf.Identity"(%outputs_10) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor + %outputs_14 = "tf.GatherV2"(%arg6, %arg1, %outputs) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor, tensor) -> tensor<4xf32> + %outputs_16 = "tf.AddV2"(%outputs_8, %outputs_14) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<4xf32>) -> tensor + %outputs_18 = "tf.TensorListSetItem"(%arg2, %arg1, %outputs_16) {device = "/job:localhost/replica:0/task:0/device:CPU:0", resize_if_index_out_of_bounds = false} : (tensor>>, tensor, tensor) -> tensor>> + return %outputs_6, %outputs_12, %outputs_18, %arg3, %arg4, %arg5, %arg6 : tensor, tensor, tensor>>, tensor, tensor>>, tensor, tensor +} + +// CHECK-LABEL: @"map/while_body/MapFnBody" +// CHECK-SAME (%arg0: !mlrt.Future, %arg1: !mlrt.Promise, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor>>, %arg6: tensor, %arg7: tensor) +// CHECK-NEXT: [[cst_0:%.*]] = "tf.Const" +// CHECK-NEXT: [[cst_1:%.*]] = "tf.Const" +// CHECK-NEXT: [[loop_counter:%.*]] = "tf.AddV2"(%arg2, [[cst_1]]) +// CHECK-NEXT: [[weight:%.*]] = "tf.ReadVariableOp"(%arg5) +// CHECK-NEXT: [[mpy:%.*]] = "tf.MatMul"(%arg6, [[weight]]) +// CHECK-NEXT: [[element_index:%.*]] = "tf.AddV2"(%arg3, [[cst_1]]) +// CHECK-NEXT: [[bias:%.*]] = "tf.GatherV2"(%arg7, %arg3, [[cst_0]]) +// CHECK-NEXT: [[res:%.*]] = "tf.AddV2"([[mpy]], [[bias]]) +// CHECK-NEXT: [[ta_0:%.*]] = "tf_mlrt.tf_await"(%arg0) +// CHECK-NEXT: [[ta_1:%.*]] = "tf.TensorListSetItem"([[ta_0]], %arg3, [[res]]) +// CHECK-NEXT: "tf_mlrt.tf_promise"(%arg1, [[ta_1]]) +// CHECK-NEXT: return + +// CHECK-LABEL: func @main_while +func.func @main_while(%arg0: tensor, %arg1: tensor) -> tensor { + %outputs = "tf.Const"() 
{device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[-1, 4]> : tensor<2xi32>} : () -> tensor<2xi32> + %outputs_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<-1> : tensor} : () -> tensor + %outputs_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %outputs_4 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %outputs_6 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %outputs_8 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor} : () -> tensor + // CHECK: [[elems:%.*]] = "tf.VarHandleOp" + %outputs_10 = "tf.VarHandleOp"() {_xla_inferred_shapes = [#tf_type.shape<>], allowed_devices = [], container = "", device = "/job:localhost/replica:0/task:0/device:CPU:0", shared_name = "w"} : () -> tensor>> + %outputs_12 = "tf.Shape"(%arg1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor<2xi32> + // CHECK: [[max_iter:%.*]] = "tf.StridedSlice" + %outputs_14 = "tf.StridedSlice"(%outputs_12, %outputs_2, %outputs_4, %outputs_4) {begin_mask = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<2xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + // CHECK: [[tensor_list:%.*]] = "tf.TensorListReserve" + %outputs_16 = "tf.TensorListReserve"(%outputs_0, %outputs_14) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor>> + // CHECK: tf_mlrt.tf_map_fn + // CHECK-SAME: ([[max_iter]], [[tensor_list]], [[max_iter]], [[elems]], %arg0, %arg1) + // CHECK-SAME: {body_fn = @"map/while_body/MapFnBody", num_tensor_list_or_flow_in = 1 : i32} + // CHECK-NOT: tf.while + 
%outputs_18:7 = "tf.While"(%outputs_6, %outputs_6, %outputs_16, %outputs_14, %outputs_10, %arg0, %arg1) {_lower_using_switch_merge = true, _num_original_outputs = 8 : i64, _read_only_resource_inputs = [6], _xla_propagate_compile_time_consts = true, body = @"map/while_body", cond = @"map/while_cond", device = "/job:localhost/replica:0/task:0/device:CPU:0", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor>>, tensor, tensor>>, tensor, tensor) -> (tensor, tensor, tensor>>, tensor, tensor>>, tensor, tensor) + %outputs_20 = "tf.TensorListStack"(%outputs_18#2, %outputs) {device = "/job:localhost/replica:0/task:0/device:CPU:0", num_elements = -1 : i64} : (tensor>>, tensor<2xi32>) -> tensor + return %outputs_20 : tensor +} + +// ----- + +// Test a while to map_fn conversion in which the passed in max_iterations +// is not in typical location of %arg3 and there are identify chains in function bodies. + +// CHECK-LABEL: @map_while_cond_170 +func.func private @map_while_cond_170(%arg0: tensor {tf._user_specified_name = "map/while/loop_counter"}, %arg1: tensor {tf._user_specified_name = "map/while/maximum_iterations"}, %arg2: tensor, %arg3: tensor, %arg4: tensor<*x!tf_type.variant>, %arg5: tensor<*xf32>) -> tensor<*xi1> attributes {tf._construction_context = "kEagerRuntime", tf._original_func_name = "map_while_cond_17"} { + %outputs = "tf.Const"() {device = "", value = dense<16> : tensor} : () -> tensor + %outputs_0 = "tf.Less"(%arg0, %arg1) {device = ""} : (tensor, tensor) -> tensor<*xi1> + %outputs_2 = "tf.Less"(%arg2, %outputs) {device = ""} : (tensor, tensor) -> tensor<*xi1> + %outputs_4 = "tf.LogicalAnd"(%outputs_0, %outputs_2) {device = ""} : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> + %outputs_6 = "tf.Identity"(%outputs_4) {device = ""} : (tensor<*xi1>) -> tensor<*xi1> + return %outputs_6 : tensor<*xi1> +} + +// Original input argument list (loop_counter, max_iterations, element_index, tensor_list, read_only_tensor_list, 
scale) +// CHECK-LABEL: @map_while_body_180 +func.func private @map_while_body_180(%arg0: tensor {tf._user_specified_name = "map/while/loop_counter"}, %arg1: tensor {tf._user_specified_name = "map/while/maximum_iterations"}, %arg2: tensor, %arg3: tensor, %arg4: tensor {tf._user_specified_name = "map/TensorArrayUnstack/TensorListFromTensor"}, %arg5: tensor {tf._user_specified_name = "input"}) -> (tensor<*xi32>, tensor<*xi32>, tensor<*xi32>, tensor<*x!tf_type.variant>, tensor, tensor) attributes {tf._construction_context = "kEagerRuntime", tf._original_func_name = "map_while_body_18"} { + %outputs = "tf.Const"() {device = "", value = dense<16> : tensor<2xi32>} : () -> tensor<2xi32> + %outputs_0 = "tf.Const"() {device = "", value = dense<16> : tensor<2xi32>} : () -> tensor<2xi32> + %outputs_2 = "tf.Const"() {device = "", value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %outputs_4 = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_6 = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_8 = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_10 = "tf.Const"() {device = "", value = dense<256> : tensor} : () -> tensor + %outputs_12 = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %outputs_14 = "tf.Range"(%outputs_12, %outputs_10, %outputs_8) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi32> + %outputs_16 = "tf.Cast"(%outputs_14) {Truncate = false, device = ""} : (tensor<*xi32>) -> tensor<*xf32> + %outputs_18 = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_20 = "tf.Const"() {device = "", value = dense<257> : tensor} : () -> tensor + %outputs_22 = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_24 = "tf.Range"(%outputs_22, %outputs_20, %outputs_18) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi32> + %outputs_26 = "tf.Cast"(%outputs_24) {Truncate = false, device = ""} : 
(tensor<*xi32>) -> tensor<*xf32> + %outputs_28 = "tf.Const"() {device = "", value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %outputs_30 = "tf.Transpose"(%outputs_26, %outputs_28) {device = ""} : (tensor<*xf32>, tensor<1xi32>) -> tensor<*xf32> + %outputs_32 = "tf.AddV2"(%arg0, %outputs_6) {device = ""} : (tensor, tensor) -> tensor<*xi32> + %outputs_34 = "tf.Identity"(%outputs_32) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + %outputs_36 = "tf.Identity"(%arg1) {device = ""} : (tensor) -> tensor<*xi32> + %outputs_38 = "tf.Mul"(%outputs_16, %arg5) {device = ""} : (tensor<*xf32>, tensor) -> tensor<*xf32> + %outputs_40 = "tf.Reshape"(%outputs_38, %outputs) {device = ""} : (tensor<*xf32>, tensor<2xi32>) -> tensor<*xf32> + %outputs_42 = "tf.AddV2"(%arg2, %outputs_4) {device = ""} : (tensor, tensor) -> tensor<*xi32> + %outputs_44 = "tf.Identity"(%outputs_42) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + %outputs_46 = "tf.TensorListGetItem"(%arg4, %arg2, %outputs_2) {device = ""} : (tensor, tensor, tensor<0xi32>) -> tensor<*xi32> + %outputs_48 = "tf.Cast"(%outputs_46) {Truncate = false, device = ""} : (tensor<*xi32>) -> tensor<*xf32> + %outputs_50 = "tf.Mul"(%outputs_30, %outputs_48) {device = ""} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %outputs_52 = "tf.Reshape"(%outputs_50, %outputs_0) {device = ""} : (tensor<*xf32>, tensor<2xi32>) -> tensor<*xf32> + %outputs_54 = "tf.MatMul"(%outputs_40, %outputs_52) {device = "", transpose_a = false, transpose_b = false} : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> + %outputs_56 = "tf.MatrixDeterminant"(%outputs_54) {T = f32, device = ""} : (tensor<*xf32>) -> tensor<*xf32> + %outputs_58 = "tf.TensorListSetItem"(%arg3, %arg2, %outputs_56) {device = "", resize_if_index_out_of_bounds = false} : (tensor, tensor, tensor<*xf32>) -> tensor<*x!tf_type.variant> + %outputs_60 = "tf.Identity"(%outputs_58) {device = ""} : (tensor<*x!tf_type.variant>) -> tensor<*x!tf_type.variant> + return %outputs_34, 
%outputs_36, %outputs_44, %outputs_60, %arg4, %arg5 : tensor<*xi32>, tensor<*xi32>, tensor<*xi32>, tensor<*x!tf_type.variant>, tensor, tensor +} + +// Converted input argument list (loop_counter, element_index, max_iterations, tensor_list, read_only_tensor_list, scale) +// CHECK-LABEL: @"map_while_body_180/MapFnBody" +// CHECK-SAME: (%arg0: !mlrt.future, %arg1: !mlrt.promise, %arg2: tensor {tf._user_specified_name = "map/while/loop_counter"}, %arg3: tensor, %arg4: tensor {tf._user_specified_name = "map/while/maximum_iterations"}, %arg5: tensor {tf._user_specified_name = "map/TensorArrayUnstack/TensorListFromTensor"}, %arg6: tensor {tf._user_specified_name = "input"}) +// CHECK: [[res:%.*]] = "tf.MatrixDeterminant" +// CHECK-NEXT: [[ta_0:%.*]] = "tf_mlrt.tf_await"(%arg0) +// CHECK-NEXT: [[ta_1:%.*]] = "tf.TensorListSetItem"([[ta_0]], %arg3, [[res]]) +// CHECK-NEXT: "tf_mlrt.tf_promise"(%arg1, [[ta_1]]) +// CHECK-NEXT: return + + +// CHECK-LABEL: __inference_while_from_map_fn_810 +// CHECK-SAME: ([[scale:%.*]]: tensor +func.func private @__inference_while_from_map_fn_810(%arg0: tensor {tf._user_specified_name = "input"}) -> tensor<*xf32> attributes {tf._construction_context = "kEagerRuntime", tf._original_func_name = "__inference_while_from_map_fn_81"} { + // CHECK: [[element_index:%.*]] = "tf.Const" + %outputs = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %outputs_0 = "tf.Const"() {device = "", value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %outputs_2= "tf.Const"() {device = "", value = dense<-1> : tensor} : () -> tensor + %outputs_4 = "tf.Const"() {device = "", value = dense<16> : tensor} : () -> tensor + // CHECK: tf.TensorListReserve + %outputs_6 = "tf.TensorListReserve"(%outputs_2, %outputs_4) {device = ""} : (tensor, tensor) -> tensor>> + %outputs_8 = "tf.Const"() {device = "", value = dense<> : tensor<0xi32>} : () -> tensor<0xi32> + %outputs_10 = "tf.Const"() {device = "", value = dense<-1> : tensor} : () -> tensor + 
%outputs_12 = "tf.Const"() {device = "", value = dense<16> : tensor} : () -> tensor + // CHECK: [[tensor_list:%.*]] = "tf.TensorListReserve"([[shape:%.*]], [[reserve_size:%.*]]) { + %outputs_14 = "tf.TensorListReserve"(%outputs_10, %outputs_12) {device = ""} : (tensor, tensor) -> tensor>> + // CHECK-NEXT: [[loop_counter:%.*]] = "tf.Const" + %outputs_16 = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + // CHECK-NEXT: [[max_iterations:%.*]] = "tf.Const" + %outputs_18 = "tf.Const"() {device = "", value = dense<16> : tensor} : () -> tensor + %outputs_20 = "tf.Const"() {device = "", value = dense<1> : tensor} : () -> tensor + %outputs_22 = "tf.Const"() {device = "", value = dense<16> : tensor} : () -> tensor + %outputs_24 = "tf.Const"() {device = "", value = dense<0> : tensor} : () -> tensor + %outputs_26 = "tf.Range"(%outputs_24, %outputs_22, %outputs_20) {device = ""} : (tensor, tensor, tensor) -> tensor<*xi32> + // CHECK: [[read_only_tensor_list:%.*]] = "tf.TensorListFromTensor" + %outputs_28 = "tf.TensorListFromTensor"(%outputs_26, %outputs_0) {device = ""} : (tensor<*xi32>, tensor<0xi32>) -> tensor<*x!tf_type.variant> +// CHECK: [[map_fn_out:%.*]] = tf_mlrt.tf_map_fn + // CHECK-SAME: ([[reserve_size]], [[tensor_list]], [[max_iterations]], [[read_only_tensor_list]], [[scale]]) + // CHECK-SAME: {body_fn = @"map_while_body_180/MapFnBody", num_tensor_list_or_flow_in = 1 : i32} + // CHECK-NOT: tf.While + %outputs_30:6 = "tf.While"(%outputs_16, %outputs_18, %outputs, %outputs_14, %outputs_28, %arg0) {T = [i32, i32, i32, !tf_type.variant, !tf_type.variant, f32], _lower_using_switch_merge = true, _num_original_outputs = 6 : i64, _read_only_resource_inputs = [], body = @map_while_body_180, cond = @map_while_cond_170, device = "", is_stateless = true, output_shapes = [#tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape<>, #tf_type.shape], parallel_iterations = 4 : i64, shape_invariant} : (tensor, tensor, tensor, 
tensor>>, tensor<*x!tf_type.variant>, tensor) -> (tensor, tensor, tensor, tensor, tensor, tensor) + // CHECK-NEXT: "tf.TensorListStack" + // CHECK-SAME: ([[map_fn_out]], + %outputs_32 = "tf.TensorListStack"(%outputs_30#3, %outputs_8) {device = "", num_elements = 16 : i64} : (tensor, tensor<0xi32>) -> tensor<*xf32> + %outputs_34 = "tf.Identity"(%outputs_32) {device = ""} : (tensor<*xf32>) -> tensor<*xf32> + return %outputs_34 : tensor<*xf32> +} + +// ----- + +// Test a while to map_fn conversion in which tensor array is used instead of +// tensor list. + +// CHECK-LABEL: map/while/LoopCond_cond +func.func private @"map/while/LoopCond_cond"(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor, %arg3: tensor, %arg4: tensor<2x!tf_type.resource>>, %arg5: tensor, %arg6: tensor<2x!tf_type.resource>>) -> tensor { + %outputs = "tf.Less"(%arg0, %arg3) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi1> + %outputs_0 = "tf.Less"(%arg1, %arg3) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi1> + %outputs_2 = "tf.LogicalAnd"(%outputs, %outputs_0) {device = ""} : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> + %outputs_4 = "tf.ToBool"(%outputs_2) : (tensor<*xi1>) -> tensor + return %outputs_4 : tensor +} + +// CHECK-LABEL: map/while/LoopCond_body +func.func private @"map/while/LoopCond_body"(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor, %arg3: tensor, %arg4: tensor<2x!tf_type.resource>>, %arg5: tensor, %arg6: tensor<2x!tf_type.resource>>) -> (tensor<*xi32>, tensor<*xi32>, tensor, tensor, tensor<2x!tf_type.resource>>, tensor, tensor<2x!tf_type.resource>>) { + %outputs = "tf.Const"() {value = dense<224> : tensor<2xi32>} : () -> tensor<2xi32> + %outputs_0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %outputs_2 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %outputs_4 = "tf.Identity"(%arg0) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + %outputs_6 = "tf.AddV2"(%outputs_4, %outputs_2) {device = ""} : (tensor<*xi32>, tensor) -> 
tensor<*xi32> + %outputs_8 = "tf.Identity"(%arg1) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + %outputs_10 = "tf.AddV2"(%outputs_8, %outputs_2) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %outputs_12 = "tf.Identity"(%arg2) {device = ""} : (tensor) -> tensor + %outputs_14 = "tf.TensorArrayReadV3"(%arg4, %outputs_8, %arg5) {device = ""} : (tensor<2x!tf_type.resource>>, tensor<*xi32>, tensor) -> tensor<*x!tf_type.string> + %outputs_16 = "tf.DecodeJpeg"(%outputs_14) {acceptable_fraction = 1.000000e+00 : f32, channels = 3 : i64, dct_method = "INTEGER_FAST", device = "", fancy_upscaling = true, ratio = 1 : i64, try_recover_truncated = false} : (tensor<*x!tf_type.string>) -> tensor + %outputs_18 = "tf.ExpandDims"(%outputs_16, %outputs_0) {device = ""} : (tensor, tensor) -> tensor<1x?x?x3xui8> + %outputs_20 = "tf.ResizeBilinear"(%outputs_18, %outputs) {align_corners = false, device = "", half_pixel_centers = false} : (tensor<1x?x?x3xui8>, tensor<2xi32>) -> tensor<1x224x224x3xf32> + %outputs_22 = "tf.Squeeze"(%outputs_20) {device = "", squeeze_dims = [0]} : (tensor<1x224x224x3xf32>) -> tensor<224x224x3xf32> + %outputs_24 = "tf.Cast"(%outputs_22) {Truncate = false, device = ""} : (tensor<224x224x3xf32>) -> tensor<224x224x3xui8> + %outputs_26 = "tf.TensorArrayWriteV3"(%arg6, %outputs_8, %outputs_24, %outputs_12) {device = ""} : (tensor<2x!tf_type.resource>>, tensor<*xi32>, tensor<224x224x3xui8>, tensor) -> tensor + return %outputs_6, %outputs_10, %outputs_26, %arg3, %arg4, %arg5, %arg6: tensor<*xi32>, tensor<*xi32>, tensor, tensor, tensor<2x!tf_type.resource>>, tensor, tensor<2x!tf_type.resource>> +} + +// CHECK-LABEL: @"map/while/LoopCond_body/MapFnBody" +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.TensorArrayReadV3 +// CHECK-NEXT: tf.DecodeJpeg +// CHECK-NEXT: tf.ExpandDims +// CHECK-NEXT: tf.ResizeBilinear +// CHECK-NEXT: tf.Squeeze +// CHECK-NEXT: 
tf.Cast +// CHECK-NEXT: tf_mlrt.tf_await +// CHECK-NEXT: tf.TensorArrayWriteV3 +// CHECK-NEXT: tf_mlrt.tf_promise +// CHECK-NEXT: return + +//CHECK-LABEL: map_while_test +func.func @map_while_test(%arg0: tensor) -> tensor { + %outputs = "tf.Const"() {value = dense<0> : tensor} : () -> tensor<1xi32> + %outputs_0 = "tf.Const"() {value = dense<224> : tensor<2xi32>} : () -> tensor<2xi32> + %outputs_2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %outputs_4 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %outputs_6 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %outputs_8 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %outputs_10 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<1xi32> + // CHECK: [[max_iter:%.*]] = "tf.StridedSlice" + %outputs_12 = "tf.StridedSlice"(%outputs_10, %outputs_6, %outputs_4, %outputs_4) {begin_mask = 0 : i64, device = "", ellipsis_mask = 0 : i64, end_mask = 0 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor + // CHECK-NEXT: tf.Range + %outputs_14 = "tf.Range"(%outputs_2, %outputs_12, %outputs_8) {device = ""} : (tensor, tensor, tensor) -> tensor + // CHECK-NEXT: [[handle_1:%.*]], [[flow_in_1:%.*]] = "tf.TensorArrayV3" + %outputs_16:2 = "tf.TensorArrayV3"(%outputs_12) {clear_after_read = true, device = "", dtype = !tf_type.string, dynamic_size = false, element_shape = #tf_type.shape<*>, identical_element_shapes = true, tensor_array_name = ""} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + // CHECK-NEXT: [[handle_2:%.*]] = "tf.TensorArrayScatterV3" + %outputs_18 = "tf.TensorArrayScatterV3"(%outputs_16#0, %outputs_14, %arg0, %outputs_16#1) {device = ""} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor + // CHECK-NEXT: tf.Range + %outputs_20 = "tf.Range"(%outputs_2, %outputs_12, %outputs_8) {device = ""} : (tensor, tensor, tensor) -> tensor + // 
CHECK-NEXT: [[tensor_array:%.*]], [[flow_in:%.*]] = "tf.TensorArrayV3" + %outputs_22:2 = "tf.TensorArrayV3"(%outputs_12) {clear_after_read = true, device = "", dtype = ui8, dynamic_size = false, element_shape = #tf_type.shape<*>, identical_element_shapes = true, tensor_array_name = ""} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + // CHECK-NEXT: tf_mlrt.tf_map_fn + // CHECK-SAME: ([[max_iter]], [[flow_in]], [[max_iter]], [[handle_1]], [[handle_2]], [[tensor_array]]) + // CHECK-SAME: {body_fn = @"map/while/LoopCond_body/MapFnBody", num_tensor_list_or_flow_in = 1 : i32} + // CHECK-NOT: tf.While + %outputs_24:7 = "tf.While"(%outputs, %outputs, %outputs_22#1, %outputs_12, %outputs_16#0, %outputs_18, %outputs_22#0) {_xla_propagate_compile_time_consts = true, body = @"map/while/LoopCond_body", cond = @"map/while/LoopCond_cond", device = "", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor<1xi32>, tensor<1xi32>, tensor, tensor, tensor<2x!tf_type.resource>>, tensor, tensor<2x!tf_type.resource>>) -> (tensor<1xi32>, tensor<1xi32>, tensor, tensor, tensor<2x!tf_type.resource>>, tensor, tensor<2x!tf_type.resource>>) + // CHECK-NEXT: tf.TensorArrayGatherV3 + %outputs_26 = "tf.TensorArrayGatherV3"(%outputs_22#0, %outputs_20, %outputs_24#2) {device = "", element_shape = #tf_type.shape<224x224x3>} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor + return %outputs_26 : tensor +} + +// ----- +// Test non-applicable while is NOT converted to map_fn. 
+ +// CHECK-LABEL: func @while_cond_lt9 +func.func @while_cond_lt9(%arg0: tensor) -> tensor { + %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<9> : tensor} : () -> tensor + %1 = "tf.Less"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %1 : tensor +} + +// CHECK-LABEL: func @while_body_add2 +func.func @while_body_add2(%arg0: tensor) -> tensor { + %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<2> : tensor} : () -> tensor + %1 = "tf.Add"(%arg0, %0) {device = "/device:CPU:0"} : (tensor, tensor) -> tensor + func.return %1 : tensor +} + +// CHECK-LABEL: func @while_test() +func.func @while_test() -> (tensor) { + %0 = "tf.Const"() {device = "/device:CPU:0", value = dense<0> : tensor} : () -> tensor + // CHECK: tf.While + %1 = "tf.While"(%0) { cond = @while_cond_lt9, body = @while_body_add2, is_stateless = false, parallel_iterations = 1} : (tensor) -> (tensor) + func.return %1 : tensor +} + +// ----- + +// Test a case that the while body has multiple tensor lists. 
+ +// CHECK-LABEL: tf.MultiListWhileRegion_body +func.func private @tf.MultiListWhileRegion_body(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor>>, %arg4: tensor) -> (tensor, tensor, tensor>>, tensor>>, tensor) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[[0.000000e+00, 1.000000e+00, 2.000000e+00, 3.000000e+00, 4.000000e+00, 5.000000e+00, 6.000000e+00, 7.000000e+00], [8.000000e+00, 9.000000e+00, 1.000000e+01, 1.100000e+01, 1.200000e+01, 1.300000e+01, 1.400000e+01, 1.500000e+01]]> : tensor<2x8xf32>} : () -> tensor<2x8xf32> + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[[1.600000e+01, 1.700000e+01, 1.800000e+01, 1.900000e+01, 2.000000e+01, 2.100000e+01, 2.200000e+01, 2.300000e+01], [2.400000e+01, 2.500000e+01, 2.600000e+01, 2.700000e+01, 2.800000e+01, 2.900000e+01, 3.000000e+01, 3.100000e+01]]> : tensor<2x8xf32>} : () -> tensor<2x8xf32> + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %0 = "tf.GatherV2"(%arg4, %cst_2, %cst_2) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor, tensor) -> tensor + %1 = "tf.AddV2"(%arg0, %cst_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.AddV2"(%arg1, %cst_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %3 = "tf.GatherV2"(%cst_0, %arg1, %cst_2) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x8xf32>, tensor, tensor) -> tensor<8xf32> + %4 = "tf.Mul"(%0, %3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<8xf32>) -> tensor<8xf32> + %5 = "tf.TensorListSetItem"(%arg2, %arg1, %4) {device = 
"/job:localhost/replica:0/task:0/device:CPU:0", resize_if_index_out_of_bounds = false} : (tensor>>, tensor, tensor<8xf32>) -> tensor>> + %6 = "tf.GatherV2"(%cst, %arg1, %cst_2) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x8xf32>, tensor, tensor) -> tensor<8xf32> + %7 = "tf.Mul"(%0, %6) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor<8xf32>) -> tensor<8xf32> + %8 = "tf.TensorListSetItem"(%arg3, %arg1, %7) {device = "/job:localhost/replica:0/task:0/device:CPU:0", resize_if_index_out_of_bounds = false} : (tensor>>, tensor, tensor<8xf32>) -> tensor>> + return %1, %2, %5, %8, %arg4 : tensor, tensor, tensor>>, tensor>>, tensor +} + +// CHECK-LABEL: tf.MultiListWhileRegion_body/MapFnBody +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.GatherV2 +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.GatherV2 +// CHECK-NEXT: tf.Mul +// CHECK-NEXT: tf.GatherV2 +// CHECK-NEXT: tf.Mul +// CHECK-NEXT: tf_mlrt.tf_await +// CHECK-NEXT: tf_mlrt.tf_await +// CHECK-NEXT: tf.TensorListSetItem +// CHECK-NEXT: tf.TensorListSetItem +// CHECK-NEXT: tf_mlrt.tf_promise +// CHECK-NEXT: tf_mlrt.tf_promise +// CHECK-NEXT: return + +// CHECK-LABEL: tf.MultiListWhileRegion_cond +func.func private @tf.MultiListWhileRegion_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor>>, %arg4: tensor) -> tensor { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<2> : tensor} : () -> tensor + %0 = "tf.Less"(%arg0, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.Less"(%arg1, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.LogicalAnd"(%0, %1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + return %2 : tensor +} + +// CHECK-LABEL: 
multilist_serving +func.func private @multilist_serving(%arg0: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> (tensor<2x8xf32>, tensor<2x8xf32>) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<8> : tensor<1xi32>} : () -> tensor<1xi32> + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<-1> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<2> : tensor} : () -> tensor + // CHECK: TensorListReserve + %0 = "tf.TensorListReserve"(%cst_1, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor>> + // CHECK-NEXT: tf_mlrt.tf_map_fn + %1:5 = "tf.While"(%cst, %cst, %0, %0, %arg0) {_lower_using_switch_merge = true, _num_original_outputs = 8 : i64, _read_only_resource_inputs = [], _xla_propagate_compile_time_consts = true, body = @tf.MultiListWhileRegion_body, cond = @tf.MultiListWhileRegion_cond, device = "/job:localhost/replica:0/task:0/device:CPU:0", is_stateless = true, parallel_iterations = 4 : i64, shape_invariant} : (tensor, tensor, tensor>>, tensor>>, tensor) -> (tensor, tensor, tensor>>, tensor>>, tensor) + // CHECK-NEXT: TensorListStack + %2 = "tf.TensorListStack"(%1#2, %cst_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0", num_elements = 2 : i64} : (tensor>>, tensor<1xi32>) -> tensor<2x8xf32> + %3 = "tf.TensorListStack"(%1#3, %cst_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0", num_elements = 2 : i64} : (tensor>>, tensor<1xi32>) -> tensor<2x8xf32> + return %3, %2 : tensor<2x8xf32>, tensor<2x8xf32> +} + + +// ----- + +// Convert a while with multiple tensor array to map_fn + +// CHECK-LABEL: tf.WhileRegion1_body( +func.func private @tf.WhileRegion1_body(%arg0: tensor, %arg1: tensor, %arg2: tensor, 
%arg3: tensor, %arg4: tensor, %arg5: tensor<2x!tf_type.resource>>, %arg6: tensor<2x!tf_type.resource>>, %arg7: tensor<*xi32>) -> (tensor, tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>, tensor<*xi32>) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %0 = "tf.AddV2"(%arg0, %cst_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.AddV2"(%arg1, %cst_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %3 = "tf.RaggedTensorToVariant"(%arg7) {RAGGED_RANK = 0 : i64, Tsplits = i64, Tvalues = i32, batched_input = false, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*xi32>) -> tensor + %4 = "tf.TensorArrayWriteV3"(%arg5, %arg1, %3, %arg2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor + %5 = "tf.RaggedTensorToVariant"(%arg7) {RAGGED_RANK = 0 : i64, Tsplits = i64, Tvalues = f32, batched_input = false, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*xi32>) -> tensor + %6 = "tf.TensorArrayWriteV3"(%arg6, %arg1, %5, %arg3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor + return %0, %1, %4, %6, %arg4, %arg5, %arg6, %arg7 : tensor, tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>, tensor<*xi32> +} + +// CHECK-LABEL: func.func private @"tf.WhileRegion1_body/MapFnBody"(%arg0: !mlrt.future, %arg1: !mlrt.promise, %arg2: !mlrt.future, %arg3: !mlrt.promise, %arg4: tensor, %arg5: tensor, 
%arg6: tensor, %arg7: tensor<2x!tf_type.resource>>, %arg8: tensor<2x!tf_type.resource>>, %arg9: tensor<*xi32>) attributes {tfrt.cost_threshold = 4294967295 : i64} +// CHECK: [[result_0:%.*]] = "tf.RaggedTensorToVariant" +// CHECK: [[result_1:%.*]] = "tf.RaggedTensorToVariant" +// CHECK-NEXT: [[flow_in_0:%.*]] = "tf_mlrt.tf_await"(%arg0) : (!mlrt.future) -> tensor +// CHECK-NEXT: [[flow_in_1:%.*]] = "tf_mlrt.tf_await"(%arg2) : (!mlrt.future) -> tensor +// CHECK-NEXT: [[flow_out_0:%.*]] = "tf.TensorArrayWriteV3"(%arg7, %arg5, [[result_0]], [[flow_in_0]]) +// CHECK-NEXT: [[flow_out_1:%.*]] = "tf.TensorArrayWriteV3"(%arg8, %arg5, [[result_1]], [[flow_in_1]]) +// CHECK-NEXT: "tf_mlrt.tf_promise"(%arg1, [[flow_out_0]]) : (!mlrt.promise, tensor) -> () +// CHECK-NEXT: "tf_mlrt.tf_promise"(%arg3, [[flow_out_1]]) : (!mlrt.promise, tensor) -> () +// CHECK-NEXT: return + +// CHECK-LABEL: tf.WhileRegion1_cond +func.func private @tf.WhileRegion1_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor, %arg3: tensor, %arg4: tensor, %arg5: tensor<2x!tf_type.resource>>, %arg6: tensor<2x!tf_type.resource>>, %arg7: tensor<*xi32>) -> (tensor) { + %0 = "tf.Less"(%arg0, %arg4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor<*xi1> + %1 = "tf.Less"(%arg1, %arg4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor<*xi1> + %2 = "tf.LogicalAnd"(%0, %1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*xi1>, tensor<*xi1>) -> tensor<*xi1> + %3 = "tf.ToBool"(%2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<*xi1>) -> tensor + return %3 : tensor +} + +// CHECK-LABEL: func.func private @tf.WhileRegion2_body( +func.func private @tf.WhileRegion2_body(%arg0: tensor<*xi32>) -> (tensor, tensor) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[-1, 4]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_0 = "tf.Const"() {device = 
"/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %max_iter = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<4> : tensor} : () -> tensor + // CHECK: "tf.TensorArrayV3" + %handle_12, %flow_13 = "tf.TensorArrayV3"(%max_iter) {device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = !tf_type.variant, dynamic_size = false, element_shape = #tf_type.shape<*>, identical_element_shapes = true, tensor_array_name = ""} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + // CHECK: "tf.TensorArrayV3" + %handle_14, %flow_15 = "tf.TensorArrayV3"(%max_iter) {device = "/job:localhost/replica:0/task:0/device:CPU:0", dtype = !tf_type.variant, dynamic_size = false, element_shape = #tf_type.shape<*>, identical_element_shapes = true, tensor_array_name = ""} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + // CHECK: tf_mlrt.tf_map_fn + // CHECK-SAME: {body_fn = @"tf.WhileRegion1_body/MapFnBody", num_tensor_list_or_flow_in = 2 : i32} + %4:8 = "tf.While"(%cst_0, %cst_0, %flow_13, %flow_15, %max_iter, %handle_12, %handle_14, %arg0) {body = @tf.WhileRegion1_body, cond = @tf.WhileRegion1_cond, device = "/job:localhost/replica:0/task:0/device:CPU:0", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>, tensor, tensor, tensor, tensor<2x!tf_type.resource>>, tensor<2x!tf_type.resource>>, tensor<*xi32>) + // CHECK: TensorArrayGatherV3 + %5 = "tf.TensorArrayGatherV3"(%handle_12, %1, %4#2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> 
tensor + // CHECK: TensorArrayGatherV3 + %6 = "tf.TensorArrayGatherV3"(%handle_14, %2, %4#3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor + return %5, %6 : tensor, tensor +} + +// ----- + +// Test a while to map_fn conversion in which tensor array is used instead of +// tensor list and the tensor array size and the number of iterations are bounded +// by separate constants of the same value. + +// CHECK-LABEL: map2/while/LoopCond_body +func.func private @"map2/while/LoopCond_body"(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor, %arg3: tensor, %arg4: tensor<2x!tf_type.resource>>, %arg5: tensor, %arg6: tensor<2x!tf_type.resource>>) -> (tensor<*xi32>, tensor<*xi32>, tensor, tensor, tensor<2x!tf_type.resource>>, tensor, tensor<2x!tf_type.resource>>) { + %outputs = "tf.Const"() {value = dense<224> : tensor<2xi32>} : () -> tensor<2xi32> + %outputs_0 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %outputs_2 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %outputs_4 = "tf.Identity"(%arg0) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + %outputs_6 = "tf.AddV2"(%outputs_4, %outputs_2) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %outputs_8 = "tf.Identity"(%arg1) {device = ""} : (tensor<*xi32>) -> tensor<*xi32> + %outputs_10 = "tf.AddV2"(%outputs_8, %outputs_2) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi32> + %outputs_12 = "tf.Identity"(%arg2) {device = ""} : (tensor) -> tensor + %outputs_14 = "tf.TensorArrayReadV3"(%arg4, %outputs_8, %arg5) {device = ""} : (tensor<2x!tf_type.resource>>, tensor<*xi32>, tensor) -> tensor<*x!tf_type.string> + %outputs_16 = "tf.DecodeJpeg"(%outputs_14) {acceptable_fraction = 1.000000e+00 : f32, channels = 3 : i64, dct_method = "INTEGER_FAST", device = "", fancy_upscaling = true, ratio = 1 : i64, try_recover_truncated = false} : (tensor<*x!tf_type.string>) -> tensor + %outputs_18 = "tf.ExpandDims"(%outputs_16, 
%outputs_0) {device = ""} : (tensor, tensor) -> tensor<1x?x?x3xui8> + %outputs_20 = "tf.ResizeBilinear"(%outputs_18, %outputs) {align_corners = false, device = "", half_pixel_centers = false} : (tensor<1x?x?x3xui8>, tensor<2xi32>) -> tensor<1x224x224x3xf32> + %outputs_22 = "tf.Squeeze"(%outputs_20) {device = "", squeeze_dims = [0]} : (tensor<1x224x224x3xf32>) -> tensor<224x224x3xf32> + %outputs_24 = "tf.Cast"(%outputs_22) {Truncate = false, device = ""} : (tensor<224x224x3xf32>) -> tensor<224x224x3xui8> + %outputs_26 = "tf.TensorArrayWriteV3"(%arg6, %outputs_8, %outputs_24, %outputs_12) {device = ""} : (tensor<2x!tf_type.resource>>, tensor<*xi32>, tensor<224x224x3xui8>, tensor) -> tensor + return %outputs_6, %outputs_10, %outputs_26, %arg3, %arg4, %arg5, %arg6: tensor<*xi32>, tensor<*xi32>, tensor, tensor, tensor<2x!tf_type.resource>>, tensor, tensor<2x!tf_type.resource>> +} + +// CHECK-LABEL: @"map2/while/LoopCond_body/MapFnBody" +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.Const +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.TensorArrayReadV3 +// CHECK-NEXT: tf.DecodeJpeg +// CHECK-NEXT: tf.ExpandDims +// CHECK-NEXT: tf.ResizeBilinear +// CHECK-NEXT: tf.Squeeze +// CHECK-NEXT: tf.Cast +// CHECK-NEXT: tf_mlrt.tf_await +// CHECK-NEXT: tf.TensorArrayWriteV3 +// CHECK-NEXT: tf_mlrt.tf_promise +// CHECK-NEXT: return + +// CHECK-LABEL: map2/while/LoopCond_cond +func.func private @"map2/while/LoopCond_cond"(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>, %arg2: tensor, %arg3: tensor, %arg4: tensor<2x!tf_type.resource>>, %arg5: tensor, %arg6: tensor<2x!tf_type.resource>>) -> tensor { + %cst = "tf.Const"() {value = dense<224> : tensor} : () -> tensor + %outputs = "tf.Less"(%arg0, %cst) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi1> + %outputs_0 = "tf.Less"(%arg1, %cst) {device = ""} : (tensor<*xi32>, tensor) -> tensor<*xi1> + %outputs_2 = "tf.LogicalAnd"(%outputs, %outputs_0) {device = ""} : (tensor<*xi1>, tensor<*xi1>) 
-> tensor<*xi1> + %outputs_4 = "tf.ToBool"(%outputs_2) : (tensor<*xi1>) -> tensor + return %outputs_4 : tensor +} + +//CHECK-LABEL: map2_while_test +func.func private @map2_while_test(%arg0: tensor) -> tensor { + // CHECK-NEXT: tf.Const + %outputs = "tf.Const"() {value = dense<0> : tensor} : () -> tensor<1xi32> + // CHECK-NEXT: [[max_iter:%.*]] = "tf.Const" + %cst_0 = "tf.Const"() {value = dense<224> : tensor} : () -> tensor + %cst_1 = "tf.Const"() {value = dense<256> : tensor} : () -> tensor + %outputs_2 = "tf.Const"() {value = dense<0> : tensor} : () -> tensor + %outputs_4 = "tf.Const"() {value = dense<1> : tensor<1xi32>} : () -> tensor<1xi32> + %outputs_6 = "tf.Const"() {value = dense<0> : tensor<1xi32>} : () -> tensor<1xi32> + %outputs_8 = "tf.Const"() {value = dense<1> : tensor} : () -> tensor + %outputs_10 = "tf.Shape"(%arg0) {device = ""} : (tensor) -> tensor<1xi32> + // CHECK: tf.Range + %outputs_14 = "tf.Range"(%outputs_2, %cst_0, %outputs_8) {device = ""} : (tensor, tensor, tensor) -> tensor + // CHECK-NEXT: tf.TensorArrayV3 + %outputs_16:2 = "tf.TensorArrayV3"(%cst_0) {clear_after_read = true, device = "", dtype = !tf_type.string, dynamic_size = false, element_shape = #tf_type.shape<*>, identical_element_shapes = true, tensor_array_name = ""} : (tensor) -> (tensor<2x!tf_type.resource>>, tensor) + // CHECK-NEXT: tf.TensorArrayScatterV3 + %outputs_18 = "tf.TensorArrayScatterV3"(%outputs_16#0, %outputs_14, %arg0, %outputs_16#1) {device = ""} : (tensor<2x!tf_type.resource>>, tensor, tensor, tensor) -> tensor + // CHECK-NEXT: tf.Range + %outputs_20 = "tf.Range"(%outputs_2, %cst_0, %outputs_8) {device = ""} : (tensor, tensor, tensor) -> tensor + // CHECK-NEXT: [[tensor_array:%.*]], [[flow_in:%.*]] = "tf.TensorArrayV3" + %outputs_22:2 = "tf.TensorArrayV3"(%cst_0) {clear_after_read = true, device = "", dtype = ui8, dynamic_size = false, element_shape = #tf_type.shape<*>, identical_element_shapes = true, tensor_array_name = ""} : (tensor) -> 
(tensor<2x!tf_type.resource>>, tensor) + // CHECK-NEXT: tf_mlrt.tf_map_fn + // CHECK-SAME: ([[max_iter]], [[flow_in]], %cst_1 + // CHECK-SAME: {body_fn = @"map2/while/LoopCond_body/MapFnBody", num_tensor_list_or_flow_in = 1 : i32} + // CHECK-NOT: tf.While + %outputs_24:7 = "tf.While"(%outputs, %outputs, %outputs_22#1, %cst_1, %outputs_16#0, %outputs_18, %outputs_22#0) {_xla_propagate_compile_time_consts = true, body = @"map2/while/LoopCond_body", cond = @"map2/while/LoopCond_cond", device = "", is_stateless = false, parallel_iterations = 10 : i64, shape_invariant} : (tensor<1xi32>, tensor<1xi32>, tensor, tensor, tensor<2x!tf_type.resource>>, tensor, tensor<2x!tf_type.resource>>) -> (tensor<1xi32>, tensor<1xi32>, tensor, tensor, tensor<2x!tf_type.resource>>, tensor, tensor<2x!tf_type.resource>>) + // CHECK-NEXT: tf.TensorArrayGatherV3 + %outputs_26 = "tf.TensorArrayGatherV3"(%outputs_22#0, %outputs_20, %outputs_24#2) {device = "", element_shape = #tf_type.shape<224x224x3>} : (tensor<2x!tf_type.resource>>, tensor, tensor) -> tensor + return %outputs_26 : tensor +} + +// ----- +// Test a nest while in which the while body is after the usage. 
+ +// CHECK-LABEL: nested_while +func.func @nested_while(%arg0: tensor {tf.device = "/job:localhost/replica:0/task:0/device:CPU:0"}) -> (tensor<16x16x?xf32>) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[16, -1]> : tensor<2xi32>} : () -> tensor<2xi32> + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<-1> : tensor} : () -> tensor + %cst_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<16> : tensor} : () -> tensor + // CHECK: tf.TensorListReserve + %0 = "tf.TensorListReserve"(%cst_1, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor>> + // CHECK-NEXT: tf_mlrt.tf_map_fn + %1:4 = "tf.While"(%cst, %cst, %0, %arg0) {_lower_using_switch_merge = true, _num_original_outputs = 6 : i64, _read_only_resource_inputs = [], _xla_propagate_compile_time_consts = true, body = @tf.NestedWhileRegion1_body, cond = @tf.NestedWhileRegion1_cond, device = "/job:localhost/replica:0/task:0/device:CPU:0", is_stateless = true, parallel_iterations = 4 : i64, shape_invariant} : (tensor, tensor, tensor>>, tensor) -> (tensor, tensor, tensor>>, tensor) + %2 = "tf.TensorListStack"(%1#2, %cst_0) {device = "/job:localhost/replica:0/task:0/device:CPU:0", num_elements = 16 : i64} : (tensor>>, tensor<2xi32>) -> tensor<16x16x?xf32> + return %2 : tensor<16x16x?xf32> +} +// CHECK-LABEL: tf.NestedWhileRegion1_body +func.func private @tf.NestedWhileRegion1_body(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor) -> (tensor, tensor, tensor>>, tensor) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<-1> : tensor<1xi32>} : () -> tensor<1xi32> + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> 
tensor + %cst_1 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : tensor<16xi32>} : () -> tensor<16xi32> + %cst_2 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %cst_3 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<16> : tensor} : () -> tensor + %cst_4 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<-1> : tensor} : () -> tensor + %0 = "tf.TensorListReserve"(%cst_4, %cst_3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor>> + %1 = "tf.AddV2"(%arg0, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.AddV2"(%arg1, %cst_2) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %3 = "tf.GatherV2"(%cst_1, %arg1, %cst_0) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<16xi32>, tensor, tensor) -> tensor + %4 = "tf.Cast"(%3) {Truncate = false, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor + %5 = "tf.Mul"(%arg3, %4) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %6:4 = "tf.While"(%cst_0, %cst_0, %0, %5) {_lower_using_switch_merge = true, _num_original_outputs = 6 : i64, _read_only_resource_inputs = [], _xla_propagate_compile_time_consts = true, body = @tf.NestedWhileRegion_body, cond = @tf.NestedWhileRegion_cond, device = "/job:localhost/replica:0/task:0/device:CPU:0", is_stateless = true, parallel_iterations = 10 : i64, shape_invariant} : (tensor, tensor, tensor>>, tensor) -> (tensor, tensor, tensor>>, tensor) + %7 = "tf.TensorListStack"(%6#2, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0", num_elements = 16 : i64} : (tensor>>, tensor<1xi32>) -> tensor<16x?xf32> + %8 = 
"tf.TensorListSetItem"(%arg2, %arg1, %7) {device = "/job:localhost/replica:0/task:0/device:CPU:0", resize_if_index_out_of_bounds = false} : (tensor>>, tensor, tensor<16x?xf32>) -> tensor>> + return %1, %2, %8, %arg3 : tensor, tensor, tensor>>, tensor +} + +//CHECK-LABEL: @"tf.NestedWhileRegion1_body/MapFnBody"(%arg0: !mlrt.future, %arg1: !mlrt.promise, %arg2: tensor, %arg3: tensor, %arg4: tensor) +// CHECK: tf.TensorListReserve +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.GatherV2 +// CHECK-NEXT: tf.Cast +// CHECK-NEXT: tf.Mul +// CHECK-NEXT: tf_mlrt.tf_map_fn +// CHECK-NEXT: tf.TensorListStack +// CHECK-NEXT: tf_mlrt.tf_await +// CHECK-NEXT: tf.TensorListSetItem +// CHECK-NEXT: tf_mlrt.tf_promise +// CHECK-NEXT: return + +// CHECK-LABEL: tf.NestedWhileRegion1_cond +func.func private @tf.NestedWhileRegion1_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor) -> tensor { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<16> : tensor} : () -> tensor + %0 = "tf.Less"(%arg0, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.Less"(%arg1, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.LogicalAnd"(%0, %1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + return %2 : tensor +} +// CHECK-LABEL: tf.NestedWhileRegion_body +func.func private @tf.NestedWhileRegion_body(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor) -> (tensor, tensor, tensor>>, tensor) { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<0> : tensor} : () -> tensor + %cst_0 = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]> : tensor<16xi32>} : () -> tensor<16xi32> + %cst_1 = "tf.Const"() {device = 
"/job:localhost/replica:0/task:0/device:CPU:0", value = dense<1> : tensor} : () -> tensor + %0 = "tf.AddV2"(%arg0, %cst_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.AddV2"(%arg1, %cst_1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.GatherV2"(%cst_0, %arg1, %cst) {batch_dims = 0 : i64, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor<16xi32>, tensor, tensor) -> tensor + %3 = "tf.Cast"(%2) {Truncate = false, device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor) -> tensor + %4 = "tf.Mul"(%arg3, %3) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %5 = "tf.TensorListSetItem"(%arg2, %arg1, %4) {device = "/job:localhost/replica:0/task:0/device:CPU:0", resize_if_index_out_of_bounds = false} : (tensor>>, tensor, tensor) -> tensor>> + return %0, %1, %5, %arg3 : tensor, tensor, tensor>>, tensor +} + +// CHECK-LABEL: tf.NestedWhileRegion_body/MapFnBody +// CHECK: tf.AddV2 +// CHECK-NEXT: tf.AddV2 +// CHECK-NEXT: tf.GatherV2 +// CHECK-NEXT: tf.Cast +// CHECK-NEXT: tf.Mul +// CHECK-NEXT: tf_mlrt.tf_await +// CHECK-NEXT: tf.TensorListSetItem +// CHECK-NEXT: "tf_mlrt.tf_promise +// CHECK-NEXT: return + +// CHECK-LABEL: tf.NestedWhileRegion_cond +func.func private @tf.NestedWhileRegion_cond(%arg0: tensor, %arg1: tensor, %arg2: tensor>>, %arg3: tensor) -> tensor { + %cst = "tf.Const"() {device = "/job:localhost/replica:0/task:0/device:CPU:0", value = dense<16> : tensor} : () -> tensor + %0 = "tf.Less"(%arg0, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %1 = "tf.Less"(%arg1, %cst) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + %2 = "tf.LogicalAnd"(%0, %1) {device = "/job:localhost/replica:0/task:0/device:CPU:0"} : (tensor, tensor) -> tensor + return %2 : tensor +} + diff --git 
a/tensorflow/compiler/mlir/tfrt/tests/saved_model/saved_model_test.cc b/tensorflow/compiler/mlir/tfrt/tests/saved_model/saved_model_test.cc index 995bb242fe3..6f6aafa566d 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/saved_model/saved_model_test.cc +++ b/tensorflow/compiler/mlir/tfrt/tests/saved_model/saved_model_test.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h" +#include +#include +#include + #include "absl/strings/match.h" #include "mlir/IR/Dialect.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD index 4badbc11669..c73d06bd8ff 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir:run_lit.sh", features = if_oss(["--path=org_tensorflow/tensorflow/compiler/mlir/tfrt"]), diff --git a/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc b/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc index eb5615dc2c6..eb2006e6849 100644 --- a/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc +++ b/tensorflow/compiler/mlir/tfrt/tf-tfrt-opt.cc @@ -17,10 +17,13 @@ limitations under the License. 
#include "mlir/InitAllDialects.h" // from @llvm-project #include "mlir/InitAllPasses.h" // from @llvm-project #include "mlir/Tools/mlir-opt/MlirOptMain.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback.h" #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_async.h" #include "tensorflow/compiler/mlir/tfrt/ir/tfrt_fallback_sync.h" @@ -28,6 +31,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_passes.h" #include "tensorflow/compiler/mlir/tfrt/jit/transforms/tf_jitrt_test_passes.h" #include "tensorflow/compiler/mlir/tfrt/transforms/gpu_passes.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h" #include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" #include "tensorflow/compiler/xla/mlir_hlo/gml_st/IR/gml_st_ops.h" #include "tensorflow/compiler/xla/mlir_hlo/gml_st/transforms/passes.h" @@ -38,6 +42,8 @@ int main(int argc, char **argv) { tensorflow::InitMlir y(&argc, &argv); mlir::registerAllPasses(); + mlir::registerInlinerPass(); + mlir::registerTensorFlowPasses(); // Register passes for TF->JitRt compilation. 
@@ -45,6 +51,8 @@ int main(int argc, char **argv) { registerTfJitRtTestPasses(); mlir::gml_st::registerGmlStPasses(); + tensorflow::mlrt_compiler::RegisterMlrtPasses(); + mlir::DialectRegistry registry; mlir::registerAllDialects(registry); mlir::RegisterAllTensorFlowDialects(registry); @@ -56,6 +64,8 @@ int main(int argc, char **argv) { registry.insert(); registry.insert(); registry.insert(); + registry.insert(); tensorflow::RegisterTPUDialects(®istry); tensorflow::RegisterGpuDialects(®istry); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD new file mode 100644 index 00000000000..beb50129756 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD @@ -0,0 +1,195 @@ +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + # copybara:uncomment "//learning/brain/experimental/tfrt:__subpackages__", + # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", + "//tensorflow/compiler/mlir/tfrt:__subpackages__", + "//tensorflow/core/tfrt:__subpackages__", + ], +) + +cc_library( + name = "parallelization", + srcs = ["parallelization.cc"], + hdrs = ["parallelization.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", + "//tensorflow/compiler/mlir/tfrt:constants", + "//tensorflow/compiler/mlir/tfrt:cost_analysis", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops", + "//tensorflow/core/tfrt/fallback:cost_recorder", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@tf_runtime//:stream_analysis", + ], +) + +cc_library( + name = "assign_op_key", + srcs = ["assign_op_key.cc"], + hdrs = ["assign_op_key.h"], + deps = [ + ":util", + 
"//tensorflow/compiler/mlir/tfrt:constants", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + ], +) + +cc_library( + name = "util", + srcs = ["util.cc"], + hdrs = ["util.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_a_m_inc_gen", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_n_z_inc_gen", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_tfrt_ops_inc_gen", + "@llvm-project//mlir:IR", + ], +) + +cc_library( + name = "tf_to_mlrt", + srcs = ["tf_to_mlrt.cc"], + hdrs = ["tf_to_mlrt.h"], + deps = [ + ":execute_op_registry", + ":tpu_conversion_patterns", + ":util", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", + "//tensorflow/compiler/mlir/tfrt:constants", + "//tensorflow/compiler/mlir/tfrt:tfrt_pipeline_options", + "//tensorflow/compiler/mlir/tfrt:transform_utils", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_tpu_ops", + "//tensorflow/core/tfrt/fallback:fallback_state", + "//tensorflow/core/tfrt/fallback:op_kernel_runner_cache", + "//third_party/protobuf", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:FuncTransforms", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "passes", + srcs = ["passes.cc"], + hdrs = ["passes.h"], + deps = [ + ":assign_op_key", + ":fuse_mlrt_ops", + ":parallelization", + ":tf_to_mlrt", + ":while_to_map_fn", + "//tensorflow/compiler/mlir/tfrt:tfrt_pipeline_options", + "//tensorflow/core/tfrt/fallback:cost_recorder", + "//tensorflow/core/tfrt/fallback:fallback_state", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "execute_op_registry", + hdrs = 
["execute_op_registry.h"], + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +cc_library( + name = "tpu_conversion_patterns", + srcs = ["tpu_conversion_patterns.cc"], + hdrs = ["tpu_conversion_patterns.h"], + deps = [ + ":execute_op_registry", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tfrt:tfrt_pipeline_options", + "//tensorflow/compiler/mlir/tfrt:transform_utils", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_tpu_ops", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:TransformUtils", + ], +) + +cc_library( + name = "fuse_mlrt_ops", + srcs = ["fuse_mlrt_ops.cc"], + hdrs = ["fuse_mlrt_ops.h"], + deps = [ + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Pass", + ], +) + +cc_library( + name = "import_model", + srcs = ["import_model.cc"], + hdrs = ["import_model.h"], + deps = [ + ":assign_op_key", + ":passes", + ":while_to_map_fn", + "//base:vlog", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tfrt:import_model", + "//tensorflow/compiler/mlir/tfrt:tf_to_tfrt", + "//tensorflow/compiler/mlir/tfrt:tfrt_compile_options", + "//tensorflow/compiler/mlir/tfrt:tfrt_pipeline_options", + "//tensorflow/compiler/mlir/tfrt/translate/mlrt:mlir_to_bytecode", + "//tensorflow/core/platform:errors", + "//tensorflow/core/platform:status", + "//tensorflow/core/platform:statusor", + "//tensorflow/core/tfrt/fallback:cost_recorder", + "//tensorflow/core/tfrt/fallback:fallback_state", + "//tensorflow/core/tfrt/mlrt/attribute", + "//tensorflow/core/tfrt/mlrt/bytecode", + "//tensorflow/tsl/platform:errors", + 
"//tensorflow/tsl/platform:status", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + ], +) + +cc_library( + name = "while_to_map_fn", + srcs = ["while_to_map_fn.cc"], + hdrs = ["while_to_map_fn.h"], + deps = [ + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:mlrt_ops", + "//tensorflow/compiler/mlir/tfrt/ir/mlrt:tf_mlrt_ops", + "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.cc new file mode 100644 index 00000000000..e2896f8a070 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.cc @@ -0,0 +1,71 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h" + +#include + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/constants.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h" + +namespace tensorflow { +namespace mlrt_compiler { +namespace { + +class AssignOpKeyPass + : public mlir::PassWrapper> { + public: + AssignOpKeyPass() = default; + AssignOpKeyPass& operator=(const AssignOpKeyPass&) = delete; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(AssignOpKeyPass) + + private: + llvm::StringRef getArgument() const final { return "tf-mlrt-assign-op-key"; } + llvm::StringRef getDescription() const final { + return "tf-mlrt-assign-op-key"; + } + + void runOnOperation() override; +}; + +void AssignOpKeyPass::runOnOperation() { + auto module = getOperation(); + mlir::OpBuilder builder(module); + + int32_t op_key = 0; + module.walk([&builder, &op_key](mlir::Operation* op) mutable { + if (UseFallback(op)) { + op->setAttr(tensorflow::tfrt_compiler::kOpKeyAttrName, + builder.getI32IntegerAttr(op_key)); + op_key++; + } + }); +} + +} // namespace + +std::unique_ptr> CreateAssignOpKeyPass() { + return std::make_unique(); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h new file mode 100644 index 00000000000..6ed9f1e9198 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h @@ -0,0 +1,32 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASSIGN_OP_KEY_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASSIGN_OP_KEY_H_ +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +// Create a pass that assigns an op_key to every fallback OP. The op_key +// provides a uniform key to look up online cost for a specific op. +// This pass is expected to run before parallerization. +std::unique_ptr> CreateAssignOpKeyPass(); + +} // namespace mlrt_compiler +} // namespace tensorflow +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_ASSIGN_OP_KEY_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h new file mode 100644 index 00000000000..93dde8140c0 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h @@ -0,0 +1,60 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_EXECUTE_OP_REGISTRY_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_EXECUTE_OP_REGISTRY_H_ + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +class ExecuteOpRegistry { + public: + mlir::LogicalResult RegisterExecuteOp(mlir::Operation* op, uint32_t op_key) { + if (op_key >= execute_ops_.size()) { + execute_ops_.resize(op_key + 1); + } + if (auto* register_op = execute_ops_[op_key]) { + if (register_op->getName() != op->getName() || + register_op->getAttrs() != op->getAttrs()) { + return op->emitError() << "Key " << op_key << " already registered."; + } + return mlir::success(); + } + execute_ops_[op_key] = op; + return mlir::success(); + } + + void ReplaceExecuteOp(int64_t key, mlir::Operation* op) { + execute_ops_[key] = op; + } + + llvm::ArrayRef GetExecuteOps() const { + return execute_ops_; + } + + private: + // Using a vector to keep fallback ops in order, and the key for a fallback op + // is its corresponding index here. 
+ llvm::SmallVector execute_ops_; +}; + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_EXECUTE_OP_REGISTRY_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.cc new file mode 100644 index 00000000000..a53404653fa --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.cc @@ -0,0 +1,157 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h" + +#include + +#include "llvm/ADT/SmallVector.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" + +namespace tensorflow { +namespace mlrt_compiler { +namespace { + +class FuseMlrtOpPass + : public mlir::PassWrapper> { + public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(FuseMlrtOpPass) + + private: + llvm::StringRef getArgument() const final { return "tf-mlrt-fuse"; } + + llvm::StringRef getDescription() const final { + return "Fuse consecutive mlrt ops of the same kind into one."; + } + + void runOnOperation() override; +}; + +void FuseGetResourceOps(mlir::OpBuilder& builder, mlir::Block& block) { + llvm::SmallVector get_resource_ops; + for (auto& op : llvm::make_early_inc_range(block)) { + if (auto get_resource_op = llvm::dyn_cast(&op)) { + get_resource_ops.push_back(get_resource_op); + } + } + + if (get_resource_ops.empty()) return; + + // The last op is always a return op, so it is guaranteed to process all + // groups of the candidate ops. 
+ auto first_get = get_resource_ops.front(); + + builder.setInsertionPointAfter(first_get); + + llvm::SmallVector indices; + llvm::SmallVector result_types; + llvm::SmallVector old_values; + + indices.reserve(get_resource_ops.size()); + result_types.reserve(get_resource_ops.size()); + old_values.reserve(get_resource_ops.size()); + + for (auto op : get_resource_ops) { + auto indices_attr = op.getIndices(); + indices.append(indices_attr.begin(), indices_attr.end()); + result_types.append(op.result_type_begin(), op.result_type_end()); + old_values.append(op.result_begin(), op.result_end()); + } + + auto new_op = builder.create( + first_get.getLoc(), result_types, builder.getArrayAttr(indices)); + + for (auto [old_value, new_value] : + llvm::zip(old_values, new_op.getResults())) { + old_value.replaceAllUsesWith(new_value); + } + + for (auto get_resource_op : get_resource_ops) { + get_resource_op->erase(); + } +} + +template +void FuseAwaitOps(mlir::OpBuilder& builder, mlir::Block& block) { + llvm::SmallVector await_ops; + for (auto& op : llvm::make_early_inc_range(block)) { + if (auto await_op = llvm::dyn_cast(&op)) { + await_ops.push_back(await_op); + continue; + } + + // The last op is always a return op, so it is guaranteed to process all + // groups of the candidate ops. 
+ if (await_ops.size() > 1) { + auto last_await = await_ops.back(); + + builder.setInsertionPointAfter(last_await); + + llvm::SmallVector futures; + futures.reserve(await_ops.size()); + for (auto op : await_ops) { + futures.push_back(op.getOperand()); + } + + llvm::SmallVector result_types; + if constexpr (!std::is_same_v) { + result_types.assign(futures.size(), builder.getType()); + } + + auto await_all = + builder.create(op.getLoc(), result_types, futures); + + if constexpr (!std::is_same_v) { + for (auto [await_op, new_value] : + llvm::zip(await_ops, await_all.getResults())) { + await_op.getResult().replaceAllUsesWith(new_value); + } + } + + for (auto await_op : await_ops) { + await_op->erase(); + } + } + + await_ops.clear(); + } +} + +void FuseMlrtOpPass::runOnOperation() { + auto func = getOperation(); + + mlir::OpBuilder builder(func); + + FuseAwaitOps( + builder, func.front()); + FuseAwaitOps( + builder, func.front()); + FuseAwaitOps(builder, func.front()); + FuseGetResourceOps(builder, func.front()); +} + +} // namespace + +std::unique_ptr> +CreateFuseMlrtOpPass() { + return std::make_unique(); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h new file mode 100644 index 00000000000..6f772a895bb --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_FUSE_MLRT_OPS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_FUSE_MLRT_OPS_H_ + +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +std::unique_ptr> CreateFuseMlrtOpPass(); + +} +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_FUSE_MLRT_OPS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc new file mode 100644 index 00000000000..63b0de8e243 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.cc @@ -0,0 +1,136 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h" + +#include + +#include "base/vlog_is_on.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/compiler/mlir/tfrt/translate/import_model.h" +#include "tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h" +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tensorflow/core/tfrt/mlrt/attribute/attribute.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/tsl/platform/errors.h" +#include "tensorflow/tsl/platform/status.h" + +namespace tensorflow { +namespace mlrt_compiler { + +StatusOr ConvertTfMlirToBytecode( + const TfrtCompileOptions& options, + const tfrt_stub::FallbackState& fallback_state, mlir::ModuleOp module, + mlir::OwningOpRef* module_with_op_keys) { + mlrt::bc::Buffer bytecode_buffer; + TF_RETURN_IF_ERROR(ConvertTfMlirToRuntimeExecutable( + options, module, + [&bytecode_buffer, &fallback_state, 
module_with_op_keys]( + mlir::PassManager& pm, mlir::ModuleOp module, + const TfrtPipelineOptions& options) { + mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); + + if (options.enable_while_parallel_iterations) { + pm.addPass(mlrt_compiler::CreateWhileToMapFnPass()); + // Remove unreachable private functions after mapfn conversion. + pm.addPass(mlir::createSymbolDCEPass()); + } + tensorflow::CreateTFExecutorToTFInvariantOptimizationPipelineHelper( + pm, options); + // TODO(b/283481729): Add test to cover unused constants that do not + // cause op_key discontinuity + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(mlrt_compiler::CreateAssignOpKeyPass()); + // Run passes until (including) AssignOpKeyPass. + if (mlir::failed(pm.run(module))) { + return diag_handler.Combine(absl::InternalError( + "failed to finish passes before (including) assign op keys.")); + } + if (VLOG_IS_ON(1)) { + tensorflow::DumpMlirOpToFile("tf_dialect_after_assign_op_key", + module); + } + // Save the module. + if (module_with_op_keys != nullptr) { + *module_with_op_keys = module.clone(); + } + // Clear passes already run. + pm.clear(); + // Create the remaining pipeline and run. + CreateTfToMlrtPipeline(pm, options, &fallback_state); + if (mlir::failed(pm.run(module))) { + return diag_handler.Combine(absl::InternalError( + "failed to lower TF Dialect to MLRT dialect.")); + } + // Generate bytecode. 
+ mlrt::AttributeEncoderRegistry registry; + registry.Register("tf_mlrt", + &tensorflow::tf_mlrt::EncodeTensorflowAttribute); + auto statusor = mlrt::EmitExecutable(registry, module); + if (!statusor.ok()) return statusor.status(); + bytecode_buffer = std::move(*statusor); + return OkStatus(); + })); + return bytecode_buffer; +} + +StatusOr ConvertTfMlirWithOpKeysToBytecode( + const TfrtCompileOptions& options, + const tfrt_stub::FallbackState& fallback_state, + mlir::ModuleOp module_with_op_keys, + const tfrt_stub::CostRecorder& cost_recorder) { + mlir::StatusScopedDiagnosticHandler diag_handler( + module_with_op_keys.getContext()); + if (VLOG_IS_ON(1)) { + tensorflow::DumpMlirOpToFile("tf_dialect_with_op_keys", + module_with_op_keys); + } + // Create the reconversion pipeline and run. + mlir::PassManager pm(module_with_op_keys.getContext()); + const auto pipeline_options = GetTfrtPipelineOptions(options); + CreateTfToMlrtPipeline(pm, *pipeline_options, &fallback_state, + &cost_recorder); + if (mlir::failed(pm.run(module_with_op_keys))) { + return diag_handler.Combine( + absl::InternalError("failed to lower TF Dialect to MLRT dialect.")); + } + // Generate bytecode. + mlrt::AttributeEncoderRegistry registry; + registry.Register("tf_mlrt", &tensorflow::tf_mlrt::EncodeTensorflowAttribute); + auto statusor = mlrt::EmitExecutable(registry, module_with_op_keys); + if (!statusor.ok()) return statusor.status(); + if (VLOG_IS_ON(1)) { + tensorflow::DumpMlirOpToFile("tfrt_dialect_from_tf_dialect_with_op_keys", + module_with_op_keys); + } + return std::move(*statusor); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h new file mode 100644 index 00000000000..37e0563c691 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/import_model.h @@ -0,0 +1,52 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IMPORT_MODEL_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IMPORT_MODEL_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" + +namespace tensorflow { +namespace mlrt_compiler { + +// Converts an MLIR `module` in TF dialect to MLRT's bytecode format. If +// `module_with_op_keys` is non-null, the intermediate module on which passes +// until (including) AssignOpKeyPass have run will be cloned to it. +// +// This is for initial conversion. +StatusOr ConvertTfMlirToBytecode( + const TfrtCompileOptions& options, + const tfrt_stub::FallbackState& fallback_state, mlir::ModuleOp module, + mlir::OwningOpRef* module_with_op_keys = nullptr); + +// Converts an MLIR `module_with_op_keys` in TF dialect to MLRT's bytecode +// format, with op costs from `cost_recorder`. +// +// This is for re-conversion. 
+StatusOr ConvertTfMlirWithOpKeysToBytecode( + const TfrtCompileOptions& options, + const tfrt_stub::FallbackState& fallback_state, + mlir::ModuleOp module_with_op_keys, + const tfrt_stub::CostRecorder& cost_recorder); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_IMPORT_MODEL_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.cc new file mode 100644 index 00000000000..7cab8a9d528 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.cc @@ -0,0 +1,833 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h" + +#include +#include +#include +#include + +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/strings/str_cat.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SetVector.h" +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfrt/analysis/cost_analysis.h" +#include "tensorflow/compiler/mlir/tfrt/constants.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tfrt/compiler/stream_analysis.h" // from @tf_runtime + +namespace tensorflow { +namespace mlrt_compiler { +namespace { + +using tensorflow::tfrt_compiler::CostAnalysis; +using tfrt::compiler::Stream; +using tfrt::compiler::StreamAnalysis; + +std::string GetStreamFunctionName(absl::string_view func_name, + const Stream& stream) { + return absl::StrCat(func_name, "_stream_", stream.id()); +} + +bool IsConstant(mlir::Operation* op) { + return op && llvm::isa(op); +} + +// StreamInfo is a bookkeeping for inputs, futures, and promises for a stream. +struct StreamInfo { + const Stream* parent = nullptr; + + // The values that are produced by constant ops. Instead of using + // promise/await to pass these values between streams, we can just copying + // these ops to the streams that use these constants. + llvm::SetVector constants; + // The values that are the inputs to the stream. + llvm::SetVector inputs; + // The values that will be the futures to the stream. 
+ llvm::SetVector futures; + // The values that will be the control futures (i.e. futures with no data) to + // the stream. + llvm::SetVector control_futures; + // The values that will be the promises to the stream. + llvm::SetVector promises; + // The values that will be the control promises (i.e., promises with no data) + // to the stream. + llvm::SetVector control_promises; + // The values that are defined by the operations in the stream. Note that all + // values in `futures` will also be in `results`. + llvm::DenseSet results; + + bool contains_only_constants = true; + + bool IsRoot() const { return parent == nullptr; } +}; + +// Preprocess the block to produce StreamInfo for every stream. +llvm::DenseMap PreprocessStreamInfo( + mlir::Block& block, + const llvm::DenseMap>& + control_predecessors, + const StreamAnalysis& stream_analysis) { + llvm::DenseMap stream_map; + + // All values that will be promises in the block. + llvm::DenseSet promises; + + // All operations that will be control promises in the block. + llvm::DenseSet control_promises; + + // Keep track of all available values and controls as we traverse the stream + // tree in depth-first order. + llvm::DenseSet available_values; + llvm::DenseSet available_controls; + + struct Entry { + explicit Entry(const Stream* stream) : stream(stream) {} + + const Stream* stream = nullptr; + + // Keep track of the next operation to be processed. If all operations are + // processed, we can pop this stream from the DFS stack. + int op_idx = 0; + }; + + std::vector stack; + stack.reserve(stream_analysis.GetNumStreams()); + + // We first push the entry for the root stream. 
+ const auto& root_stream = stream_analysis.GetRootStream(); + auto& root_stream_info = stream_map[&root_stream]; + available_values.insert(block.getArguments().begin(), + block.getArguments().end()); + root_stream_info.results.insert(block.getArguments().begin(), + block.getArguments().end()); + stack.push_back(Entry(&root_stream)); + + // The root stream's first operation a dummy operation that defines all block + // arguments. + for (auto* child_stream : root_stream.GetChildStreamsForRootOp()) { + stream_map[child_stream].parent = &root_stream; + stack.push_back(Entry(child_stream)); + } + + // The first DFS traveral populates inputs and futures for every stream but + // not promises. We only know whether a value definition is a promise only + // after traversing all streams, so it is not possible to know it in the first + // pass. + while (!stack.empty()) { + auto& [stream, op_idx] = stack.back(); + auto& stream_info = stream_map[stream]; + + auto ops = stream->ops(); + + // If we finish processing all operations in the stream, we can pop this + // stream, as well as the values defined by its operations. + if (op_idx == ops.size()) { + for (auto* op : stream->ops()) { + // Erase the values and controls produced by the current stream. + for (auto result : op->getResults()) { + available_values.erase(result); + } + available_controls.erase(op); + } + // Futures and control futures will also be available, so we erase them as + // well. + for (auto future : stream_info.futures) { + available_values.erase(future); + } + for (auto* control_future : stream_info.control_futures) { + available_controls.erase(control_future); + } + + if (!stream_info.IsRoot()) { + // Merge inputs, futures, and promises into the parent stream, as they + // will be passed down from the root in the output program. 
+ DCHECK_GT(stream_map.count(stream_info.parent), 0); + auto& parent_info = stream_map[stream_info.parent]; + + for (const auto& input : stream_info.inputs) { + DCHECK(available_values.contains(input)); + if (!parent_info.results.contains(input)) { + // An input in the current stream will be an input in the parent + // stream only if it is not a result in the parent stream. + parent_info.inputs.insert(input); + } + } + + for (auto future : stream_info.futures) { + DCHECK(!available_values.contains(future)); + parent_info.futures.insert(future); + } + for (auto* control_future : stream_info.control_futures) { + DCHECK(!available_controls.contains(control_future)); + parent_info.control_futures.insert(control_future); + } + } + + // Update the global promise set. + promises.insert(stream_info.futures.begin(), stream_info.futures.end()); + control_promises.insert(stream_info.control_futures.begin(), + stream_info.control_futures.end()); + + stack.pop_back(); + continue; + } + + // We process the operations one by one. If the operation has child streams, + // we process the child streams first before continuing to the next + // operation. + bool has_child_streams = false; + for (; op_idx < ops.size() && !has_child_streams; ++op_idx) { + auto* op = ops[op_idx]; + + stream_info.contains_only_constants &= IsConstant(op); + + // Check every operand to see whether it is a future or input. + for (mlir::Value operand : op->getOperands()) { + // If the value is defined in the current stream, nothing needs to be + // done. + if (!stream_info.results.contains(operand)) { + if (available_values.insert(operand).second) { + // If the operand is not available in the current stream or any + // parent stream, it will be a future and then become a result. 
+ if (IsConstant(operand.getDefiningOp())) { + stream_info.constants.insert(operand); + } else { + stream_info.futures.insert(operand); + } + stream_info.results.insert(operand); + } else { + // If the operand is not available in the current stream but + // available in the parent stream, it is an input. + if (IsConstant(operand.getDefiningOp())) { + stream_info.constants.insert(operand); + } else { + stream_info.inputs.insert(operand); + } + } + } + } + + // Insert mlrt.await_control if this op has control deps on other ops. + if (auto ctrl_iter = control_predecessors.find(op); + ctrl_iter != control_predecessors.end()) { + const auto& ctrl_deps = ctrl_iter->second; + + for (mlir::Operation* control_dep : ctrl_deps) { + if (available_controls.insert(control_dep).second) { + // If the control is not already available, it will be a control + // future and then become available. + stream_info.control_futures.insert(control_dep); + } + } + } + + // Update results of this operations. + for (mlir::Value result : op->getResults()) { + available_values.insert(result); + stream_info.results.insert(result); + } + + // Update this op as an available control. + available_controls.insert(op); + + // Pause processing the current stream to process the child streams first. + const auto& child_streams = stream->GetChildStreams(op); + has_child_streams = !child_streams.empty(); + for (auto* child_stream : child_streams) { + stream_map[child_stream].parent = stream; + stack.push_back(Entry(child_stream)); + } + } + } + + // The second pass populates promises for each stream. We also need to merge + // promises in a child to its parent stream. We can do this by traversing the + // operation in reverse program order. 
+ for (auto& op : llvm::reverse(block)) { + const auto& stream = stream_analysis.GetStream(&op); + auto& stream_info = stream_map[&stream]; + + for (mlir::Value result : op.getResults()) { + if (promises.contains(result)) { + stream_info.promises.insert(result); + } + } + + if (control_promises.contains(&op)) { + stream_info.control_promises.insert(&op); + } + + for (const auto* child_stream : stream.GetChildStreams(&op)) { + const auto& child_info = stream_map[child_stream]; + + stream_info.promises.insert(child_info.promises.begin(), + child_info.promises.end()); + stream_info.control_promises.insert(child_info.control_promises.begin(), + child_info.control_promises.end()); + } + } + + // Special handling for the dummy operation in the root. + auto& root_info = stream_map[&root_stream]; + for (const auto* child_stream : root_stream.GetChildStreamsForRootOp()) { + const auto& child_info = stream_map[child_stream]; + + root_info.promises.insert(child_info.promises.begin(), + child_info.promises.end()); + root_info.control_promises.insert(child_info.control_promises.begin(), + child_info.control_promises.end()); + } + + return stream_map; +} + +// A custom struct that groups mappings for values, futures and promises for a +// stream during creating the corresponding stream function. +struct Mapping { + // This is the mappings for the SSA values used in the original and new + // operations. + mlir::IRMapping value_mapping; + + // Maps the original tensor value that will be a future to the corresponding + // !mlrt.future value. + mlir::IRMapping future_mapping; + + // Maps the original tensor value that will be a promise to the corresponding + // !mlrt.promise value. + mlir::IRMapping promise_mapping; + + // In addition to value mappings, we also need mappings for input control + // dependencies to the corresponding !mlrt.future and !mlrt.promise values. 
+ llvm::DenseMap future_control_mapping; + llvm::DenseMap promise_control_mapping; +}; + +mlrt::compiler::AsyncOp CreateAsyncOp( + mlir::OpBuilder& builder, absl::string_view function_name, + const llvm::DenseMap& stream_map, + const Stream& stream, const Mapping& mapping, mlir::Location loc) { + auto iter = stream_map.find(&stream); + DCHECK(iter != stream_map.end()); + const auto& stream_info = iter->second; + + if (stream_info.contains_only_constants) return nullptr; + + const auto& [value_mapping, future_mapping, promise_mapping, + future_control_mapping, promise_control_mapping] = mapping; + + llvm::SmallVector async_operands; + + for (auto input : stream_info.inputs) { + async_operands.push_back(value_mapping.lookup(input)); + DCHECK(async_operands.back()); + } + + for (auto future : stream_info.futures) { + async_operands.push_back(future_mapping.lookup(future)); + DCHECK(async_operands.back()); + } + + for (auto* control_future : stream_info.control_futures) { + DCHECK_GT(future_control_mapping.count(control_future), 0); + async_operands.push_back(future_control_mapping.lookup(control_future)); + DCHECK(async_operands.back()); + } + + for (auto promise : stream_info.promises) { + async_operands.push_back(promise_mapping.lookup(promise)); + DCHECK(async_operands.back()); + } + + for (auto* control_promise : stream_info.control_promises) { + DCHECK_GT(promise_control_mapping.count(control_promise), 0); + async_operands.push_back(promise_control_mapping.lookup(control_promise)); + DCHECK(async_operands.back()); + } + + return builder.create( + loc, builder.getType(), async_operands, + mlir::SymbolRefAttr::get(builder.getContext(), + GetStreamFunctionName(function_name, stream))); +} + +mlir::func::FuncOp CreateStreamFunction( + mlir::OpBuilder& builder, Mapping& mapping, absl::string_view name, + const Stream& stream, const StreamInfo& stream_info, mlir::Location loc) { + if (stream_info.contains_only_constants) return nullptr; + + auto& [value_mapping, 
future_mapping, promise_mapping, future_control_mapping, + promise_control_mapping] = mapping; + + llvm::SmallVector arg_types; + for (mlir::Value input : stream_info.inputs) { + arg_types.push_back(input.getType()); + } + + arg_types.append( + stream_info.futures.size() + stream_info.control_futures.size(), + builder.getType()); + arg_types.append( + stream_info.promises.size() + stream_info.control_promises.size(), + builder.getType()); + + // The stream function has no result. + auto func_type = builder.getFunctionType(arg_types, /*results=*/{}); + + auto func = builder.create( + loc, GetStreamFunctionName(name, stream), func_type); + func.setVisibility(mlir::func::FuncOp::Visibility::Private); + + // Populate the body of the stream function by copying over the operations + // in the stream. + auto* new_block = func.addEntryBlock(); + + // Replace inputs with the function arguments. + for (int i = 0; i < stream_info.inputs.size(); ++i) { + value_mapping.map(stream_info.inputs[i], new_block->getArgument(i)); + } + + // Maps the original tensor value that will be a future or a promise to + // the corresponding !mlrt.future or !mlrt.promise value. 
+ size_t start = stream_info.inputs.size(); + for (int i = 0; i < stream_info.futures.size(); ++i) { + future_mapping.map(stream_info.futures[i], + new_block->getArgument(i + start)); + } + + start += stream_info.futures.size(); + for (int i = 0; i < stream_info.control_futures.size(); ++i) { + future_control_mapping[stream_info.control_futures[i]] = + new_block->getArgument(i + start); + } + + start += stream_info.control_futures.size(); + for (int i = 0; i < stream_info.promises.size(); ++i) { + promise_mapping.map(stream_info.promises[i], + new_block->getArgument(i + start)); + } + + start += stream_info.promises.size(); + for (int i = 0; i < stream_info.control_promises.size(); ++i) { + promise_control_mapping[stream_info.control_promises[i]] = + new_block->getArgument(i + start); + } + + return func; +} + +void CreateAllocateFuturesOp(mlir::OpBuilder& builder, Mapping& mapping, + const StreamInfo& stream_info, + mlir::Location loc) { + auto& [value_mapping, future_mapping, promise_mapping, future_control_mapping, + promise_control_mapping] = mapping; + + DCHECK_EQ(stream_info.futures.size(), stream_info.promises.size()); + + llvm::SmallVector promise_types( + stream_info.promises.size(), + builder.getType()); + llvm::SmallVector future_types( + stream_info.futures.size(), + builder.getType()); + + if (!stream_info.futures.empty()) { + auto allocate_futures = builder.create( + loc, promise_types, future_types, stream_info.futures.size()); + for (int i = 0; i < stream_info.futures.size(); ++i) { + future_mapping.map(stream_info.futures[i], + allocate_futures.getFutures()[i]); + } + + for (int i = 0; i < stream_info.futures.size(); ++i) { + // Use the original values in `futures` to make sure futures[i] shares the + // state with promises[i]. 
+ DCHECK(stream_info.promises.contains(stream_info.futures[i])); + promise_mapping.map(stream_info.futures[i], + allocate_futures.getPromises()[i]); + } + } + + DCHECK_EQ(stream_info.control_futures.size(), + stream_info.control_promises.size()); + if (!stream_info.control_futures.empty()) { + promise_types.resize(stream_info.control_promises.size(), + builder.getType()); + future_types.resize(stream_info.control_futures.size(), + builder.getType()); + + auto allocate_control_futures = + builder.create( + loc, promise_types, future_types, + stream_info.control_futures.size()); + for (int i = 0; i < stream_info.control_futures.size(); ++i) { + future_control_mapping[stream_info.control_futures[i]] = + allocate_control_futures.getFutures()[i]; + } + for (int i = 0; i < stream_info.control_futures.size(); ++i) { + // Use the original operations in `control_futures` to make sure + // control_futures[i] shares the state with control_promises[i]. + DCHECK(stream_info.control_promises.contains( + stream_info.control_futures[i])); + promise_control_mapping[stream_info.control_futures[i]] = + allocate_control_futures.getPromises()[i]; + } + } +} + +class TensorflowCostModel : public StreamAnalysis::CostModelInterface { + public: + explicit TensorflowCostModel(CostAnalysis* cost_analysis) + : cost_analysis_(*cost_analysis) {} + + std::optional GetOperationCost(mlir::Operation* op) const override { + return cost_analysis_.GetCost(op); + } + + private: + const CostAnalysis& cost_analysis_; +}; + +bool SkipControlDep(mlir::Operation* op) { + // TODO(chky): Consider define side effects more properly for these ops. + return llvm::isa( + op); +} + +void ParallelizeBlock( + absl::string_view name, mlir::Block& block, + const mlir::TF::SideEffectAnalysis::Info& side_effect_analysis, + const tfrt_stub::CostRecorder* cost_recorder) { + // First, we use SideEffectAnalysis to find out control predecessors for each + // operation. We use this map later to insert control futures. 
+ llvm::DenseMap> + control_predecessors; + for (auto& op : block) { + auto& deps = control_predecessors[&op]; + for (auto* dep : side_effect_analysis.DirectControlPredecessors(&op)) { + // If we skip the control deps of `op`, then we need to use the control + // deps of these control deps instead. + if (SkipControlDep(dep)) { + for (auto* d : control_predecessors[dep]) { + DCHECK(!SkipControlDep(d)); + deps.insert(d); + } + } else { + deps.insert(dep); + } + } + } + + // Remove skipped control deps. + for (auto& op : block) { + if (SkipControlDep(&op)) { + control_predecessors.erase(&op); + } + } + + // Perform stream analysis. + CostAnalysis cost_analysis( + llvm::cast(block.getParentOp()), cost_recorder); + TensorflowCostModel cost_model(&cost_analysis); + StreamAnalysis stream_analysis(block, &cost_model); + + // Preprocess all streams to gather StreamInfos for all streams, without + // modifying the program. + llvm::DenseMap stream_map = + PreprocessStreamInfo(block, control_predecessors, stream_analysis); + + // Then we perform a DFS traversal to create stream functions and insert async + // operations. + std::vector stack; + stack.reserve(stream_analysis.GetNumStreams()); + + const auto& root_stream = stream_analysis.GetRootStream(); + stack.push_back(&root_stream); + + llvm::SmallVector to_remove; + + mlir::OpBuilder builder(block.getParentOp()); + + while (!stack.empty()) { + const auto* stream = stack.back(); + stack.pop_back(); + DCHECK(stream); + + DCHECK_GT(stream_map.count(stream), 0); + const auto& stream_info = stream_map[stream]; + + Mapping mapping; + auto& [value_mapping, future_mapping, promise_mapping, + future_control_mapping, promise_control_mapping] = mapping; + + // `async_handles` keeps the !mlrt.async_handle created in the stream. A + // mlrt.await_handle op will be inserted at the end of the stream function + // for each async handle. 
+ llvm::SmallVector async_handles; + + mlir::func::FuncOp stream_func; + if (!stream_info.IsRoot()) { + // If it is not a root stream, we need to create a new function for this + // stream. And futures and promises are also passed as parameters. For the + // root stream, futures and promises are allocated in the body. + + // Insert the stream function before the original function. + builder.setInsertionPoint(block.getParentOp()); + + stream_func = + CreateStreamFunction(builder, mapping, name, *stream, stream_info, + block.getParentOp()->getLoc()); + + if (stream_func) { + // Set the insertion point to the start of the new block in the + // function. + builder.setInsertionPointToStart(&stream_func.front()); + } + } else { + stream_func = llvm::cast(block.getParentOp()); + + DCHECK_EQ(stream, &root_stream); + // If it is the root stream, we insert new operations in the original + // function. And we need to allocate all the futures used here. + builder.setInsertionPointToStart(&block); + + // The block arguments of the root stream are in the `results`. There will + // be no additional inputs in `inputs`. + DCHECK(stream_info.inputs.empty()); + + // Put the original arguments in the mapping as they are not changed. + for (auto arg : block.getArguments()) { + value_mapping.map(arg, arg); + } + + // Insert a tf_mlrt.allocate_futures op to allocate all futures used. + CreateAllocateFuturesOp(builder, mapping, stream_info, + block.getParentOp()->getLoc()); + + // Lastly for the root stream, we need to handle the dummy op that defines + // the arguments. + for (const auto* child_stream : stream->GetChildStreamsForRootOp()) { + stack.push_back(child_stream); + if (auto async = + CreateAsyncOp(builder, name, stream_map, *child_stream, mapping, + block.getParentOp()->getLoc())) { + async_handles.push_back(async); + } + } + } + + for (auto* op : stream->ops()) { + to_remove.push_back(op); + } + + // Skip empty streams. 
+ if (!stream_func) continue; + + mlir::Operation* return_op = nullptr; + + // Cloning the operations in the stream. If the operand is a future, a + // tf_mlrt.Await op will be inserted. If the result is a promise, a + // tf_mlrt.Promise will be inserted. Similar to control futures and control + // promises. + for (auto* op : stream->ops()) { + // Clone the current op into the function of this stream, using the + // new operands, which can be futures. + for (mlir::Value operand : op->getOperands()) { + if (stream_info.constants.contains(operand) && + !value_mapping.contains(operand)) { + builder.clone(*operand.getDefiningOp(), value_mapping); + } else if (stream_info.futures.contains(operand) && + !value_mapping.contains(operand)) { + // Insert Await op if it is a future. + auto future_value = builder.create( + op->getLoc(), operand.getType(), future_mapping.lookup(operand)); + + // Now this future is available in the current stream, so it can be a + // normal value. + value_mapping.map(operand, future_value); + } + } + + if (auto ctrl_iter = control_predecessors.find(op); + ctrl_iter != control_predecessors.end()) { + const auto& ctrl_deps = ctrl_iter->second; + + for (mlir::Operation* control_dep : ctrl_deps) { + // This control may be available in the ancestors or in a previous + // AwaitControl, we only insert a new AwaitControl if it is not. + if (stream_info.control_futures.contains(control_dep)) { + if (auto iter = future_control_mapping.find(control_dep); + iter != future_control_mapping.end()) { + builder.create( + control_dep->getLoc(), iter->second); + + // Now we no longer need this control dep in this stream. + future_control_mapping.erase(iter); + } + } + } + } + + // Clone the op using the value mapping that includes values from futures. + auto* new_op = builder.clone(*op, value_mapping); + + // TODO(chky): Ensure the original return op is in the root stream. This + // is currently an implicit guarantee in stream analysis. 
+ if (llvm::isa(op)) { + DCHECK(stream_info.IsRoot()) << name << " " << stream->id(); + return_op = new_op; + } + + for (mlir::Value result : op->getResults()) { + if (stream_info.promises.contains(result)) { + // Insert Promise op if the result is a promise. + builder.create(op->getLoc(), + promise_mapping.lookup(result), + value_mapping.lookup(result)); + } + } + + if (stream_info.control_promises.contains(op)) { + // Insert Promise op if this op produce a control dependency to ops in + // other streams. + builder.create( + op->getLoc(), promise_control_mapping[op]); + } + + // If this op has child streams, insert mlrt.async ops. + for (auto* child_stream : stream->GetChildStreams(op)) { + stack.push_back(child_stream); + if (auto async = CreateAsyncOp(builder, name, stream_map, *child_stream, + mapping, op->getLoc())) { + async_handles.push_back(async); + } + } + } + + // Create the return op for non-root streams. + // + // TODO(chky): Ensure the original return op is in the root stream. This is + // currently an implicit guarantee in stream analysis. + if (!return_op) { + DCHECK(!stream_info.IsRoot()) << name << " " << stream->id(); + return_op = + builder.create(block.getParentOp()->getLoc()); + } + + // We need to wait for async executions at the end of the stream function, + // in order to manage resource lifetime and handle errors properly. These + // mlrt.await_handle ops are inserted before the return op. + builder.setInsertionPoint(return_op); + for (auto handle : async_handles) { + builder.create( + block.getParentOp()->getLoc(), handle); + } + } + + // Remove the operations in the original block. 
+ for (auto* op : llvm::reverse(to_remove)) { + op->dropAllDefinedValueUses(); + op->erase(); + } +} + +class ParallelizationPass + : public mlir::PassWrapper> { + public: + ParallelizationPass() = default; + ParallelizationPass(uint64_t cost_threshold, + bool merge_inter_dependent_streams, + const tfrt_stub::CostRecorder* cost_recorder) { + cost_threshold_ = cost_threshold; + merge_inter_dependent_streams_ = merge_inter_dependent_streams; + cost_recorder_ = cost_recorder; + } + ParallelizationPass(const ParallelizationPass&) {} + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(ParallelizationPass) + + private: + void getDependentDialects(mlir::DialectRegistry& registry) const override { + registry.insert(); + registry.insert(); + } + + llvm::StringRef getArgument() const final { + return "tf-mlrt-parallelization"; + } + + llvm::StringRef getDescription() const final { + return "Parallelize tf graphs by inserting mlrt async operations."; + } + + void runOnOperation() override { + auto module = getOperation(); + + mlir::Builder builder(module); + module->setAttr("tfrt.cost_threshold", + builder.getI64IntegerAttr(cost_threshold_)); + module->setAttr("tfrt.merge_inter_dependent_streams", + builder.getBoolAttr(merge_inter_dependent_streams_)); + + mlir::TF::SideEffectAnalysis side_effect_analysis(module); + + for (auto func_op : + llvm::make_early_inc_range(module.getOps())) { + ParallelizeBlock(func_op.getSymName(), func_op.front(), + side_effect_analysis.GetAnalysisForFunc(func_op), + cost_recorder_); + } + } + + Option cost_threshold_{ + *this, "tfrt-cost-threshold", + llvm::cl::desc("If a sequence of operations has a cost lower than the " + "cost-threshold, the sequence will be executed as a block " + "in the same thread."), + llvm::cl::init(1)}; + Option merge_inter_dependent_streams_{ + *this, "tfrt-merge-inter-dependent-streams", + llvm::cl::desc("If true, streams with inter data depenedencies will be " + "preferred to be merged for inline execution."), + 
llvm::cl::init(false)}; + const tfrt_stub::CostRecorder* cost_recorder_ = nullptr; +}; + +} // namespace + +std::unique_ptr> CreateParallelizationPass( + uint64_t cost_threshold, bool merge_inter_dependent_streams, + const tfrt_stub::CostRecorder* cost_recorder) { + return std::make_unique( + cost_threshold, merge_inter_dependent_streams, cost_recorder); +} + +std::unique_ptr> +CreateParallelizationPass() { + return std::make_unique(); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h new file mode 100644 index 00000000000..71221276fa9 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h @@ -0,0 +1,37 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PARALLELIZATION_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PARALLELIZATION_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" + +namespace tensorflow { +namespace mlrt_compiler { + +std::unique_ptr> CreateParallelizationPass( + uint64_t cost_threshold, bool merge_inter_dependent_streams, + const tfrt_stub::CostRecorder* cost_recorder = nullptr); + +std::unique_ptr> +CreateParallelizationPass(); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PARALLELIZATION_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.cc new file mode 100644 index 00000000000..b55a7ff19bc --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.cc @@ -0,0 +1,64 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/assign_op_key.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/fuse_mlrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/parallelization.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" + +namespace tensorflow { +namespace mlrt_compiler { + +void RegisterMlrtPasses() { + mlir::registerPass([]() { return CreateAssignOpKeyPass(); }); + mlir::registerPass([]() { return CreateParallelizationPass(); }); + mlir::registerPass([]() { return CreateWhileToMapFnPass(); }); + mlir::registerPass( + []() { return CreateTfToMlrtPreParallelizationConversionPass({}); }); + mlir::registerPass([]() { return CreateTfToMlrtConversionPass({}); }); + mlir::registerPass([]() { return CreateFuseMlrtOpPass(); }); +} + +void CreateTfToMlrtPipeline(mlir::OpPassManager &pm, + const TfrtPipelineOptions &options, + const tfrt_stub::FallbackState *fallback_state, + const tfrt_stub::CostRecorder *cost_recorder) { + pm.addPass( + mlrt_compiler::CreateTfToMlrtPreParallelizationConversionPass(options)); + pm.addPass(mlrt_compiler::CreateParallelizationPass( + options.cost_threshold, options.merge_inter_dependent_streams, + cost_recorder)); + + DCHECK(fallback_state); + pm.addPass( + mlrt_compiler::CreateTfToMlrtConversionPass(options, fallback_state)); + + // Perform optimizations in the lowered MLIR. 
+ pm.addNestedPass(mlrt_compiler::CreateFuseMlrtOpPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addPass(mlir::createInlinerPass()); + pm.addNestedPass(mlir::createCSEPass()); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h new file mode 100644 index 00000000000..f9bf621b8bf --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/passes.h @@ -0,0 +1,38 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PASSES_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PASSES_H_ + +#include "mlir/Pass/PassOptions.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/core/tfrt/fallback/cost_recorder.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" + +namespace tensorflow { +namespace mlrt_compiler { + +void RegisterMlrtPasses(); + +// Creates a pipeline of passes that lowers MLIR TF dialect to MLRT dialects. +// The op costs from `cost_recorder` (if non-null) are used for Stream Analysis. 
+void CreateTfToMlrtPipeline( + mlir::OpPassManager& pm, const TfrtPipelineOptions& options, + const tfrt_stub::FallbackState* fallback_state, + const tfrt_stub::CostRecorder* cost_recorder = nullptr); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_PASSES_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc new file mode 100644 index 00000000000..c9ec37e0aea --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc @@ -0,0 +1,1146 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h" + +#include + +#include +#include +#include +#include +#include + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Dialect/Func/Transforms/FuncConversions.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinDialect.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "third_party/protobuf/text_format.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" +#include "tensorflow/compiler/mlir/tfrt/constants.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/utils.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" +#include "tensorflow/core/tfrt/fallback/op_kernel_runner_cache.h" + +namespace tensorflow { +namespace mlrt_compiler { +namespace { + +// TODO(chky): Add registration interface for custom device +mlir::Value CreateCustomDevice(mlir::Location loc, llvm::StringRef device_name, + mlir::ConversionPatternRewriter 
&rewriter) { + if (device_name == kTpuHostDevice) { + return rewriter.create( + loc, rewriter.getType()); + } + + return nullptr; +} + +class FuncOpSignatureConversion final + : public mlir::OpConversionPattern { + public: + explicit FuncOpSignatureConversion( + mlir::MLIRContext *context, mlir::TypeConverter *type_converter, + const llvm::DenseMap> + *function_call_site_input_types) + : mlir::OpConversionPattern(context), + type_converter_(*type_converter), + function_call_site_input_types_(*function_call_site_input_types) {} + + mlir::LogicalResult matchAndRewrite( + mlir::func::FuncOp func_op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + auto it = function_call_site_input_types_.find(func_op.getName()); + if (it == function_call_site_input_types_.end()) { + return mlir::failure(); + } + const llvm::SmallVector &call_site_input_types = it->second; + + mlir::FunctionType func_type = func_op.getFunctionType(); + DCHECK_EQ(func_type.getNumInputs(), call_site_input_types.size()); + + mlir::TypeConverter::SignatureConversion converted_signature( + func_type.getNumInputs()); + for (const auto &[index, value] : llvm::enumerate(call_site_input_types)) { + converted_signature.addInputs(index, value); + } + + // Update the function signature in-place. 
+ rewriter.updateRootInPlace(func_op, [&] { + func_op.setType(mlir::FunctionType::get( + func_op.getContext(), converted_signature.getConvertedTypes(), + func_type.getResults())); + }); + + // Update the entry block + if (rewriter.applySignatureConversion(&func_op.getBody(), + converted_signature, + &type_converter_) == nullptr) { + return mlir::failure(); + } + + return mlir::success(); + } + + private: + mlir::TypeConverter &type_converter_; + const llvm::DenseMap> + &function_call_site_input_types_; +}; + +class TFAwaitOpConversion final + : public mlir::OpConversionPattern { + public: + explicit TFAwaitOpConversion(mlir::MLIRContext *context) + : mlir::OpConversionPattern(context) {} + + mlir::LogicalResult matchAndRewrite( + tf_mlrt::TFAwaitOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + auto new_op = rewriter.create( + op->getLoc(), rewriter.getType(), + adaptor.getFuture()); + rewriter.replaceOp(op, new_op.getResult()); + return mlir::success(); + } +}; + +class TFPromiseOpConversion final + : public mlir::OpConversionPattern { + public: + explicit TFPromiseOpConversion(mlir::MLIRContext *context) + : mlir::OpConversionPattern(context) {} + + mlir::LogicalResult matchAndRewrite( + tf_mlrt::TFPromiseOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + if (llvm::isa<::mlrt::compiler::FutureType>( + adaptor.getTensor().getType())) { + auto new_op = rewriter.create( + op->getLoc(), adaptor.getPromise(), adaptor.getTensor()); + rewriter.replaceOp(op, new_op->getResults()); + + } else { + auto new_op = rewriter.create( + op->getLoc(), adaptor.getPromise(), adaptor.getTensor()); + rewriter.replaceOp(op, new_op->getResults()); + } + return mlir::success(); + } +}; + +// Convert tf_mlrt::MapFn's signature to tf_mlrt::TFTensorType +class TFMapFnOpConversion + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult 
matchAndRewrite( + tf_mlrt::TFMapFnOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + llvm::SmallVector result_types; + result_types.resize(op->getResultTypes().size(), + rewriter.getType()); + + auto new_op = rewriter.create( + op.getLoc(), result_types, adaptor.getOperands(), op->getAttrs()); + rewriter.replaceOp(op, new_op.getResult()); + return mlir::success(); + } +}; + +// Convert TF call ops (eg. StatefulPartitionedCall) to call. +template +class TFCallOpConversion : public mlir::OpConversionPattern { + public: + TFCallOpConversion(mlir::MLIRContext *context, + mlir::TypeConverter *type_converter) + : mlir::OpConversionPattern(context), + type_converter_(*type_converter) {} + + mlir::LogicalResult matchAndRewrite( + TFCallOp op, typename TFCallOp::Adaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + if (auto xla_must_compile = + op->template getAttrOfType("_XlaMustCompile"); + xla_must_compile && xla_must_compile.getValue()) { + return mlir::failure(); + } + + auto callee = + op.getCallableForCallee().template dyn_cast(); + if (!callee) return mlir::failure(); + + llvm::SmallVector result_types; + for (auto type : op.getOperation()->getResultTypes()) { + if (failed(type_converter_.convertType(type, result_types))) + return mlir::failure(); + } + + auto new_op = rewriter.create( + op.getLoc(), result_types, callee.getRootReference().getValue(), + adaptor.getOperands()); + rewriter.replaceOp(op, new_op.getResults()); + return mlir::success(); + } + + private: + mlir::TypeConverter &type_converter_; +}; + +// Convert tf.Case op to mlrt.Case. +// +// TF dialect: +// %outputs = "tf.Case"(%idx_tensor, %arg, ...) { branches = [@branch0, +// @branch1], +// ...} +// +// lowered MLRT dialect: +// %branch_idx = tf_mlrt.tensor_to_int32(%idx_tensor) +// %outputs = mlrt.case %branch_idx [@branch0, @branch1] (%arg, ...) 
+class CaseOpConversion : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult matchAndRewrite( + mlir::TF::CaseOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::ArrayAttr branches = op.getBranches(); + + llvm::SmallVector result_types; + result_types.resize(op->getResultTypes().size(), + rewriter.getType()); + + auto index_operand = rewriter.create( + op.getLoc(), rewriter.getI32Type(), adaptor.getBranchIndex()); + + auto new_op = rewriter.create( + op.getLoc(), result_types, index_operand.getResult(), branches, + adaptor.getInput()); + + rewriter.replaceOp(op, new_op.getResults()); + return mlir::success(); + } +}; + +class AsyncOpConversion + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + // Hook for derived classes to implement combined matching and rewriting. + mlir::LogicalResult matchAndRewrite( + mlrt::compiler::AsyncOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp( + op, op->getResultTypes(), adaptor.getOperands(), op.getCallee()); + return mlir::success(); + } +}; + +// SetResourceOpConversion lowers a TF SetResource op to a tf_mlrt.set_resource +// op. +class SetResourceOpConversion final + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult matchAndRewrite( + mlir::TF::_TfrtSetResourceOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp(op, adaptor.getArg(), + op.getIndex()); + return mlir::success(); + } +}; + +// GetResourceOpConversion lowers a TF GetResource op to a tf_mlrt.get_resource +// op. 
+class GetResourceOpConversion final + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult matchAndRewrite( + mlir::TF::_TfrtGetResourceOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + llvm::SmallVector result_types( + op.getNumResults(), rewriter.getType()); + auto new_op = rewriter.create( + op->getLoc(), result_types, op.getIndices()); + rewriter.replaceOp(op, new_op->getResults()); + return mlir::success(); + } +}; + +std::optional DecodeLongName(mlir::Location loc) { + if (auto name_loc = loc.dyn_cast()) { + return name_loc.getName().str(); + } + + if (auto fused_loc = loc.dyn_cast()) { + std::string fused_name; + for (auto l : fused_loc.getLocations()) { + if (auto n = DecodeLongName(l)) { + fused_name += *n; + } + } + return fused_name; + } + + return std::nullopt; +} + +std::string GetNodeName(mlir::Operation *op) { + auto name = [&]() -> std::string { + if (auto name = DecodeLongName(op->getLoc())) { + return *std::move(name); + } + + return op->getName().stripDialect().str(); + }(); + + for (char &c : name) { + if (c == ':') c = '/'; + } + return name; +} + +void CanonicalizeFunctionNameInNodeDef(const mlir::SymbolTable &symbol_table, + NodeDef &node_def) { + for (auto &p : *node_def.mutable_attr()) { + if (p.second.has_func()) { + auto *func = p.second.mutable_func(); + if (auto n = CanonicalizeTensorflowFunctionName( + symbol_table, func->name(), + /*use_mlir_func_name=*/false)) { + func->set_name(*n); + } + } + + if (p.second.has_list() && p.second.list().func_size() > 0) { + for (auto &func : *p.second.mutable_list()->mutable_func()) { + if (auto n = CanonicalizeTensorflowFunctionName( + symbol_table, func.name(), + /*use_mlir_func_name=*/false)) { + func.set_name(*n); + } + } + } + } +} + +class ExecuteOpConversion final : public mlir::ConversionPattern { + public: + ExecuteOpConversion(mlir::MLIRContext *context, + const mlir::SymbolTable 
*symbol_table, + mlir::TypeConverter *type_converter, + ExecuteOpRegistry *execute_op_registry, + tfrt_stub::OpKernelRunnerCache *op_kernel_cache, + const tfrt_stub::FallbackState *fallback_state) + : mlir::ConversionPattern(*type_converter, + mlir::Pattern::MatchAnyOpTypeTag(), + /*benefit=*/1, context), + symbol_table_(*symbol_table), + execute_op_registry_(*execute_op_registry), + op_kernel_cache_(*op_kernel_cache), + fallback_state_(*fallback_state) {} + + mlir::LogicalResult matchAndRewrite( + mlir::Operation *op, llvm::ArrayRef operands, + mlir::ConversionPatternRewriter &rewriter) const override { + // TODO(b/173017701): Avoid fallback for ops within XLA GPU clusters. + if (!UseFallback(op)) return mlir::failure(); + + // The assign_op_key pass should have ran. + if (!op->hasAttr(tensorflow::tfrt_compiler::kOpKeyAttrName)) + return op->emitError("does not have op_key defined"); + + std::string node_name = GetNodeName(op); + + uint32_t execute_key = op->getAttrOfType( + tensorflow::tfrt_compiler::kOpKeyAttrName) + .getInt(); + + absl::StrAppend(&node_name, "_", execute_key); + + auto statusor_node_def = tensorflow::ConvertTFDialectOpToNodeDef( + op, node_name, /*ignore_unregistered_attrs=*/false); + if (!statusor_node_def.ok()) + return op->emitWarning("failed to export NodeDef."); + auto &node_def = **statusor_node_def; + + CanonicalizeFunctionNameInNodeDef(symbol_table_, node_def); + + std::string node_def_text; + proto2::TextFormat::PrintToString(node_def, &node_def_text); + + auto op_kernel_runner = op_kernel_cache_.GetOrCreate( + tfrt::Location(nullptr, execute_key), node_def.op(), node_def.device(), + op->getNumOperands(), + [&](tensorflow::AttrValueMap *attr_value_map) { + *attr_value_map = node_def.attr(); + return OkStatus(); + }, + fallback_state_.device_manager(), + fallback_state_.process_function_library_runtime()); + LOG_IF(ERROR, !op_kernel_runner.ok()) << op_kernel_runner.status(); + + mlir::Value device; + if (auto custom_device = + 
op->getAttrOfType(kTfMlrtCustomDevice)) { + device = + CreateCustomDevice(op->getLoc(), custom_device.getValue(), rewriter); + if (!device) return op->emitWarning("Failed to create custom device."); + } + + mlir::Operation *new_op = nullptr; + if (op_kernel_runner.ok() && (*op_kernel_runner)->IsAsync()) { + // If it is an AsyncOpKernel, we lower it to tf_mlrt.async_executeop, + // which return !mlrt.futures. These results will be converted as + // necessary through the target materialization hook in the type + // converter. + llvm::SmallVector result_types( + op->getNumResults(), rewriter.getType()); + if (device) { + new_op = rewriter.replaceOpWithNewOp( + op, result_types, device, operands, node_def_text, execute_key); + } else { + new_op = rewriter.replaceOpWithNewOp( + op, result_types, operands, node_def_text, execute_key); + } + if (mlir::failed( + execute_op_registry_.RegisterExecuteOp(new_op, execute_key))) { + return op->emitWarning("Fail to register async op"); + } + } else { + // Otherwise, lower to tf_mlrt.executeop. + llvm::SmallVector result_types( + op->getNumResults(), rewriter.getType()); + if (device) { + new_op = rewriter.replaceOpWithNewOp( + op, result_types, device, operands, node_def_text, execute_key); + } else { + new_op = rewriter.replaceOpWithNewOp( + op, result_types, operands, node_def_text, execute_key); + } + + if (op_kernel_runner.ok()) { + // Only register this executeop if its opkernel can be created. + // Otherwise, it is an unused op so we don't need to create them at + // runtime. 
+ if (mlir::failed( + execute_op_registry_.RegisterExecuteOp(new_op, execute_key))) { + return op->emitWarning("Fail to register sync op"); + } + } + } + + return mlir::success(); + } + + private: + const mlir::SymbolTable &symbol_table_; + ExecuteOpRegistry &execute_op_registry_; + tfrt_stub::OpKernelRunnerCache &op_kernel_cache_; + const tfrt_stub::FallbackState &fallback_state_; +}; + +mlir::Value GetPredicate(mlir::Operation *op, mlir::Value cond_operand, + mlir::ConversionPatternRewriter &rewriter) { + return rewriter.create( + op->getLoc(), rewriter.getI1Type(), cond_operand); +} + +class CondOpConversion : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult matchAndRewrite( + mlir::TF::IfOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::FlatSymbolRefAttr then_branch = op.getThenBranchAttr(); + mlir::FlatSymbolRefAttr else_branch = op.getElseBranchAttr(); + + llvm::SmallVector result_types( + op.getNumResults(), rewriter.getType()); + + auto bool_cond = GetPredicate(op, adaptor.getCond(), rewriter); + + auto new_op = rewriter.create( + op.getLoc(), result_types, bool_cond, adaptor.getInput(), then_branch, + else_branch); + + rewriter.replaceOp(op, new_op.getResults()); + + return mlir::success(); + } +}; + +// Convert TF WhileOp to mlrt.while. +// The pseudo code of mlrt.while is as follows: +// +// while(cond) { +// outputs, cond = body(inputs) +// inputs = outputs +// } +// return outputs, cond +// +// So we need to insert extra conversion kernels and merge functions when +// lowering tf.While to mlrt.while. 
+// +// %result = tf.While(%arg) {cond = @original_cond_fn, body = +// @original_body_fn} +// +// is converted to +// +// func @new_pred_fn(%arg) { +// %cond_tensor = func.call @original_cond_fn(%arg) +// %cond_bool = mlrt.predicate %cond_tensor +// return %cond_bool +// } +// +// func @new_while_body(%arg) { +// %result = func.call @original_body_fn(%arg) +// %cond_bool = func.call @new_pred_fn(%result) +// return%result, %cond_bool +// } +// +// %first_iter_cond = func.call @new_pred_fn(%arg) +// %result = mlrt.while %first_iter_cond @new_while_body(%arg) +// +class WhileOpConversion : public mlir::OpConversionPattern { + public: + WhileOpConversion(mlir::MLIRContext *context, + mlir::TypeConverter *type_converter, + mlir::SymbolTable *symbol_table) + : mlir::OpConversionPattern(*type_converter, context), + symbol_table_(*symbol_table) {} + + mlir::LogicalResult matchAndRewrite( + mlir::TF::WhileOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + mlir::FlatSymbolRefAttr cond_fn = op.getCondAttr(); + mlir::FlatSymbolRefAttr body_fn = op.getBodyAttr(); + + // Create the predicate function that calls the original cond function and + // in addition convert the result to a boolean value. + mlir::func::FuncOp pred_fn = GetPredicateFunction( + op, cond_fn, adaptor.getOperands().getTypes(), rewriter); + if (!pred_fn) return mlir::failure(); + + // Insert a call op to call the pred function for the first iteration. + auto call_pred_fn = rewriter.create( + op.getLoc(), pred_fn.getFunctionType().getResults(), + pred_fn.getSymName(), adaptor.getOperands()); + + if (!call_pred_fn) return mlir::failure(); + + // Create the new while body function. + mlir::func::FuncOp new_body_fn = GetWhileBodyFunction( + op, body_fn, pred_fn, adaptor.getOperands().getTypes(), rewriter); + + // mlrt.while returns one more additional boolean value than tf.while. 
+ llvm::SmallVector while_result_types( + adaptor.getOperands().getTypes().begin(), + adaptor.getOperands().getTypes().end()); // = while_arg_types; + while_result_types.push_back(rewriter.getI1Type()); + auto new_op = rewriter.create( + op.getLoc(), while_result_types, call_pred_fn.getResult(0), + adaptor.getOperands(), new_body_fn.getSymName()); + + rewriter.replaceOp(op, new_op.getResults().drop_back()); + + return mlir::success(); + } + + private: + mlir::func::FuncOp GetPredicateFunction( + mlir::TF::WhileOp op, mlir::FlatSymbolRefAttr cond_fn, + mlir::TypeRange arg_types, + mlir::ConversionPatternRewriter &rewriter) const; + + mlir::func::FuncOp GetWhileBodyFunction( + mlir::TF::WhileOp op, mlir::FlatSymbolRefAttr body_fn, + mlir::func::FuncOp pred_fn, mlir::TypeRange arg_types, + mlir::ConversionPatternRewriter &rewriter) const; + + mlir::SymbolTable &symbol_table_; +}; + +// Create the pred function that contains a call to the original cond function +// and a predicate kernel that converts the cond tensor to a boolean value. eg. 
+// +// func @pred_fn( %arg) { +// %cond_tensor = tf_mlrt.call @original_cond_fn(%arg) +// %cond_bool = tf_mlrt.predicate %cond_tensor +// return %cond_bool +// } +// +mlir::func::FuncOp WhileOpConversion::GetPredicateFunction( + mlir::TF::WhileOp op, mlir::FlatSymbolRefAttr cond_fn, + mlir::TypeRange arg_types, + mlir::ConversionPatternRewriter &rewriter) const { + std::string pred_fn_name = + absl::StrCat(cond_fn.getValue().str(), "/tf_mlrt_predicate"); + + if (auto pred_fn = symbol_table_.lookup(pred_fn_name)) { + return pred_fn; + } + + auto func_op = op->getParentOfType(); + + mlir::ConversionPatternRewriter::InsertionGuard insertion_guard(rewriter); + rewriter.setInsertionPointAfter(func_op); + + auto func_type = rewriter.getFunctionType(arg_types, {rewriter.getI1Type()}); + + auto pred_fn = + rewriter.create(op.getLoc(), pred_fn_name, func_type); + + auto *block = pred_fn.addEntryBlock(); + rewriter.setInsertionPointToStart(block); + + auto call_cond_fn = rewriter.create( + op.getLoc(), arg_types.take_front(), cond_fn, block->getArguments()); + mlir::Value bool_cond = GetPredicate(op, call_cond_fn.getResult(0), rewriter); + rewriter.create(op.getLoc(), bool_cond); + + symbol_table_.insert(pred_fn); + + return pred_fn; +} + +// Create the new while body function that contains a call to original while +// body and then a call to the pred function. eg. 
+// +// func @while_body(%arg) { +// %result = mlrt.call @original_body(%arg) +// %cond_bool = mlrt.call @pred_function(%arg) +// mlrt.return %result, %cond_bool +// } +// +mlir::func::FuncOp WhileOpConversion::GetWhileBodyFunction( + mlir::TF::WhileOp op, mlir::FlatSymbolRefAttr original_body_fn, + mlir::func::FuncOp pred_fn, mlir::TypeRange arg_types, + mlir::ConversionPatternRewriter &rewriter) const { + std::string body_fn_name = + absl::StrCat(original_body_fn.getValue().str(), "/tf_mlrt_body"); + + if (auto body_fn = symbol_table_.lookup(body_fn_name)) { + return body_fn; + } + + auto func_op = op->getParentOfType(); + + mlir::ConversionPatternRewriter::InsertionGuard insertion_guard(rewriter); + rewriter.setInsertionPointAfter(func_op); + + llvm::SmallVector body_result_types(arg_types.begin(), + arg_types.end()); + // The last result of the while body function is the boolean condition. + body_result_types.push_back(rewriter.getI1Type()); + + auto func_type = rewriter.getFunctionType(arg_types, body_result_types); + auto body_fn = + rewriter.create(op.getLoc(), body_fn_name, func_type); + + auto *block = body_fn.addEntryBlock(); + rewriter.setInsertionPointToStart(block); + + // Insert a call to the original body function. + // The returned result type is also the original argument types. + auto call_original_body_fn = rewriter.create( + op.getLoc(), arg_types, original_body_fn, block->getArguments()); + + // Insert a call to the pred function, which contains a call to the original + // cond function and the predicate kernel that converts the tensor to boolean + // value. + auto call_pred_fn = rewriter.create( + op.getLoc(), pred_fn.getFunctionType().getResults(), pred_fn.getSymName(), + call_original_body_fn.getResults()); + + llvm::SmallVector body_results = + call_original_body_fn.getResults(); + + // The last result should be the boolean value converted from the condition. 
+ auto bool_cond = call_pred_fn.getResult(0); + body_results.push_back(bool_cond); + + rewriter.create(op.getLoc(), body_results); + + symbol_table_.insert(body_fn); + + return body_fn; +} + +class BatchFunctionOpConversion + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult matchAndRewrite( + mlir::TF::BatchFunctionOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + std::string node_name = GetNodeName(op); + + auto statusor_node_def = tensorflow::ConvertTFDialectOpToNodeDef( + op, node_name, /*ignore_unregistered_attrs=*/true); + if (!statusor_node_def.ok()) + return op->emitWarning("failed to export NodeDef."); + const auto &node_def = **statusor_node_def; + + std::string node_def_text; + proto2::TextFormat::PrintToString(node_def, &node_def_text); + + llvm::SmallVector result_types( + op->getNumResults(), rewriter.getType()); + + rewriter.replaceOpWithNewOp( + op, result_types, adaptor.getOperands(), node_def.device(), + op.getFAttr(), node_def_text); + + return mlir::success(); + } +}; + +void CreateFallbackInitializationFunction( + mlir::ModuleOp module, ExecuteOpRegistry &execute_op_registry) { + mlir::OpBuilder builder(&module.getBodyRegion()); + + auto func_op = builder.create( + module.getLoc(), "_tfrt_fallback_init", + mlir::FunctionType::get(module.getContext(), /*inputs=*/{}, + /*outputs=*/{})); + + auto *block = func_op.addEntryBlock(); + builder.setInsertionPointToStart(block); + + // Create operations for all fallback kernels in the module. + for (const auto &[op_index, op] : + llvm::enumerate(execute_op_registry.GetExecuteOps())) { + if (op) { + // There might be unused ops, and we don't need to create them at runtime. + // + // TODO(chky, deqiangc): Clean up unused ops before hand. 
+ builder.create( + func_op.getLoc(), /*resultTypes=*/mlir::TypeRange{}, + /*operands=*/mlir::ValueRange{}, op->getAttrs()); + } + } + + builder.create(func_op.getLoc()); +} + +// Move the tf_mlrt.await ops to right before their first uses to avoid +// unnecessary blocking. +void MoveAwaitOpToFirstUse(mlir::Block &block) { + llvm::SmallVector await_ops; + for (auto &op : block) { + if (auto await_op = llvm::dyn_cast(&op)) { + await_ops.push_back(await_op); + } + } + + for (auto op : await_ops) { + auto result = op.getResult(); + if (result.use_empty()) continue; + + mlir::Operation *first_user = *result.user_begin(); + for (auto *user : result.getUsers()) { + if (user->isBeforeInBlock(first_user)) { + first_user = user; + } + } + + op->moveBefore(first_user); + } +} + +const tfrt_stub::FallbackState &GetDefaultFallbackState() { + static const auto *const fallback_state = []() { + tensorflow::SessionOptions session_options; + tensorflow::FunctionDefLibrary fdef_lib; + auto fallback_state = + tfrt_stub::FallbackState::Create(session_options, fdef_lib).value(); + return fallback_state.release(); + }(); + + return *fallback_state; +} + +// The conversion pass that is run before 'tf-mlrt-parallelization' passes. The +// parallelization pass changes the graph content, so any rewrite/conversion +// that depends on the graph instead of individual ops should be done before +// parallelization. +class TfToMlrtPreParallelizationConversionPass + : public mlir::PassWrapper> { + public: + TfToMlrtPreParallelizationConversionPass() = default; + explicit TfToMlrtPreParallelizationConversionPass( + const TfrtPipelineOptions &options) { + // This is needed to progating user configs into this pass. 
+ options_.copyOptionValuesFrom(options); + } + TfToMlrtPreParallelizationConversionPass( + const TfToMlrtPreParallelizationConversionPass &other) {} + TfToMlrtPreParallelizationConversionPass &operator=( + const TfToMlrtPreParallelizationConversionPass &) = delete; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + TfToMlrtPreParallelizationConversionPass) + + private: + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + + RegisterTpuDialect(registry); + } + + llvm::StringRef getArgument() const final { + return "pre-parallel-tf-to-mlrt"; + } + llvm::StringRef getDescription() const final { + return "pre-parallel-tf-to-mlrt"; + } + + mlir::LogicalResult initialize(mlir::MLIRContext *context) override { + if (use_tpu_host_allocator_for_inputs_.hasValue()) { + options_.use_tpu_host_allocator_for_inputs = + use_tpu_host_allocator_for_inputs_; + } + + return mlir::success(); + } + + mlir::LogicalResult runOnFunction(mlir::func::FuncOp func) { + auto &context = getContext(); + mlir::ConversionTarget target(context); + mlir::RewritePatternSet patterns(&getContext()); + target.addLegalDialect(); + PopulateTpuPreParallelizationConversionPatterns(target, patterns, options_); + + return mlir::applyPartialConversion(func, target, std::move(patterns)); + } + + void runOnOperation() override { + auto module = getOperation(); + + for (auto func : module.getOps()) { + if (mlir::failed(runOnFunction(func))) { + signalPassFailure(); + return; + } + } + } + + Option use_tpu_host_allocator_for_inputs_{ + *this, "use-tpu-host-allocator-for-inputs", + llvm::cl::desc("If true, fallback executeops that produce inputs to tpu " + "program will use tpu host allocator."), + llvm::cl::init(false)}; + + TfrtPipelineOptions options_; +}; + +class TfToMlrtConversionPass + : public mlir::PassWrapper> { + public: + TfToMlrtConversionPass() + : TfToMlrtConversionPass({}, &GetDefaultFallbackState()) {} + 
explicit TfToMlrtConversionPass( + const TfrtPipelineOptions &options, + const tfrt_stub::FallbackState *fallback_state) + : fallback_state_(*fallback_state) { + // This is needed to progating user configs into this pass. + options_.copyOptionValuesFrom(options); + } + TfToMlrtConversionPass(const TfToMlrtConversionPass &other) + : fallback_state_(other.fallback_state_) {} + TfToMlrtConversionPass &operator=(const TfToMlrtConversionPass &) = delete; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TfToMlrtConversionPass) + + private: + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + registry.insert(); + + RegisterTpuDialect(registry); + } + + llvm::StringRef getArgument() const final { return "tf-to-mlrt"; } + llvm::StringRef getDescription() const final { return "tf-to-mlrt"; } + + mlir::LogicalResult initialize(mlir::MLIRContext *context) override { + // TODO(b/285064425): See if this and below are the right way to + // accommodate other dialects. + type_converter_.addConversion([](mlir::Type type) { return type; }); + type_converter_.addConversion( + [=](mlir::TensorType type) -> std::optional { + // Ref types are not supported in both compiler and runtime. 
+ if (type.getElementType().isa()) + return std::nullopt; + return tf_mlrt::TFTensorType::get(context); + }); + + auto future_to_tensor_materialization = + [](mlir::OpBuilder &builder, mlir::Type desired_type, + mlir::ValueRange inputs, mlir::Location loc) -> mlir::Value { + if (inputs.size() != 1) return mlir::Value(); + + if (inputs[0].getType().isa()) { + if (desired_type.isa()) { + return builder.create(loc, desired_type, inputs[0]); + } + + return mlir::Value(); + } + + return inputs[0]; + }; + + type_converter_.addTargetMaterialization(future_to_tensor_materialization); + type_converter_.addArgumentMaterialization( + future_to_tensor_materialization); + type_converter_.addSourceMaterialization( + [](mlir::OpBuilder &builder, mlir::Type result_type, + mlir::ValueRange inputs, + mlir::Location loc) -> std::optional { + return builder + .create(loc, result_type, + inputs) + .getResult(0); + }); + + if (use_tpu_host_allocator_for_inputs_.hasValue()) { + options_.use_tpu_host_allocator_for_inputs = + use_tpu_host_allocator_for_inputs_; + } + + return mlir::success(); + } + + void runOnOperation() override { + auto module = getOperation(); + mlir::SymbolTable symbol_table(module); + + // Use llvm::make_early_inc_range instead of the stock range from + // module.getOps because conversions such as WhileOpConversion could insert + // new functions into the module ops list causing the stock range to not + // able to find next OP correctly. + for (auto func : + llvm::make_early_inc_range(module.getOps())) { + if (mlir::failed(runOnFunction(func, symbol_table))) { + signalPassFailure(); + return; + } + } + + // Some mlrt kernels such as tf_mlrt_tpu.CompileAndExecute produce futures, + // but function invoked by mlrt execute op are not aware of these changes. + // We add a post process to fix up this caller-callee mismatch. 
+ for (auto func : module.getOps()) { + CollectFunctionCallSiteInputTypes(func); + } + for (auto func : module.getOps()) { + if (mlir::failed(PostProcessFunctionSignature(func, symbol_table))) { + signalPassFailure(); + return; + } + // Move the tf_mlrt.await ops to right before their first uses to avoid + // unnecessary blocking. + MoveAwaitOpToFirstUse(func.getBlocks().front()); + } + + CreateFallbackInitializationFunction(module, execute_op_registry_); + + module.walk([&](mlir::UnrealizedConversionCastOp op) { + op->replaceAllUsesWith(op->getOperands()); + op->erase(); + }); + } + + mlir::LogicalResult PostProcessFunctionSignature( + mlir::func::FuncOp func, mlir::SymbolTable &symbol_table) { + mlir::ConversionTarget target(getContext()); + mlir::RewritePatternSet patterns(&getContext()); + + target.addDynamicallyLegalOp( + [this](mlir::func::FuncOp func) { + // By default, we assume callers are well behaved. + if (function_call_site_input_types_.find(func.getName()) == + function_call_site_input_types_.end()) { + return true; + } + DCHECK_EQ(function_call_site_input_types_.at(func.getName()).size(), + func.getFunctionType().getInputs().size()); + + for (auto [expected_input_type, call_site_type] : + llvm::zip(func.getFunctionType().getInputs(), + function_call_site_input_types_.at(func.getName()))) { + if (expected_input_type != call_site_type) { + return false; + } + } + return true; + }); + + patterns.add(&getContext(), &type_converter_, + &function_call_site_input_types_); + + return mlir::applyPartialConversion(func, target, std::move(patterns)); + } + + void CollectFunctionCallSiteInputTypes(mlir::func::FuncOp func) { + func.walk([&function_call_site_input_types = + function_call_site_input_types_]( + mlir::Operation *op) mutable { + // Only collect the call-site input types when a function is invoked + // by async op. This is the only known case that the previous pass + // may left un-match types between call-site and callee. 
+ if (auto async_op = llvm::dyn_cast(op)) { + function_call_site_input_types[async_op.getCallee() + .getLeafReference()] = + llvm::SmallVector(async_op.getOperandTypes().begin(), + async_op.getOperandTypes().end()); + } + }); + } + + mlir::LogicalResult runOnFunction(mlir::func::FuncOp func, + mlir::SymbolTable &symbol_table) { + auto &context = getContext(); + mlir::ConversionTarget target(context); + mlir::RewritePatternSet patterns(&getContext()); + target.addLegalDialect(); + target.addIllegalDialect(); + target.addIllegalOp(); + target.addIllegalOp(); + target.addIllegalOp(); + + target.addDynamicallyLegalOp( + [this](mlir::func::FuncOp op) { + return type_converter_.isSignatureLegal(op.getFunctionType()); + }); + target.addDynamicallyLegalOp( + [this](mlir::func::ReturnOp op) { + for (auto operand : op.getOperands()) { + if (!type_converter_.isLegal(operand.getType())) return false; + } + return true; + }); + target.addDynamicallyLegalOp( + [this](mlrt::compiler::AsyncOp op) { + for (auto operand : op.getOperands()) { + if (!type_converter_.isLegal(operand.getType())) return false; + } + return true; + }); + target.addDynamicallyLegalOp( + [this](mlir::func::CallOp op) { + for (auto operand : op.getOperands()) { + if (!type_converter_.isLegal(operand.getType())) return false; + } + return true; + }); + + // LINT.IfChange(fallback_allow_list) + // Order the list of added ops alphabetically. 
+ patterns.add(&context, &type_converter_, &symbol_table); + patterns.add(&context); + patterns.add(type_converter_, &context); + patterns.add(&context, &symbol_table, &type_converter_, + &execute_op_registry_, &op_kernel_cache_, + &fallback_state_); + patterns.add, + TFCallOpConversion, + TFCallOpConversion>(&context, + &type_converter_); + // LINT.ThenChange(util.cc:fallback_allow_list) + + mlir::populateFunctionOpInterfaceTypeConversionPattern( + patterns, type_converter_); + mlir::populateReturnOpTypeConversionPattern(patterns, type_converter_); + + PopulateTpuConversionPatterns(target, patterns, type_converter_, + execute_op_registry_, options_); + + return mlir::applyPartialConversion(func, target, std::move(patterns)); + } + + Option use_tpu_host_allocator_for_inputs_{ + *this, "use-tpu-host-allocator-for-inputs", + llvm::cl::desc("If true, fallback executeops that produce inputs to tpu " + "program will use tpu host allocator."), + llvm::cl::init(false)}; + + TfrtPipelineOptions options_; + mlir::TypeConverter type_converter_; + ExecuteOpRegistry execute_op_registry_; + tfrt_stub::OpKernelRunnerCache op_kernel_cache_; + const tfrt_stub::FallbackState &fallback_state_; + + // True input argument types for a given function at call site. 
+ llvm::DenseMap> + function_call_site_input_types_; +}; + +} // namespace + +std::unique_ptr> +CreateTfToMlrtPreParallelizationConversionPass( + const TfrtPipelineOptions &options) { + return std::make_unique(options); +} + +std::unique_ptr> +CreateTfToMlrtConversionPass(const TfrtPipelineOptions &options, + const tfrt_stub::FallbackState *fallback_state) { + return std::make_unique(options, fallback_state); +} + +std::unique_ptr> +CreateTfToMlrtConversionPass(const TfrtPipelineOptions &options) { + return CreateTfToMlrtConversionPass(options, &GetDefaultFallbackState()); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h new file mode 100644 index 00000000000..1206f66f72b --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.h @@ -0,0 +1,48 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TF_TO_MLRT_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TF_TO_MLRT_H_ +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" +#include "tensorflow/core/tfrt/fallback/fallback_state.h" + +namespace tensorflow { +namespace mlrt_compiler { + +// The conversion pass that is run before 'tf-mlrt-parallelization' passes. The +// parallelization pass changes the graph content, so any rewrite/conversion +// that depends on the graph instead of individual ops should be done before +// parallelization. +std::unique_ptr> +CreateTfToMlrtPreParallelizationConversionPass( + const TfrtPipelineOptions& options); + +// The conversion pass that is run after 'tf-mlrt-parallelization' passes. The +// parallelization pass changes the graph content, so this pass should only +// contain conversion that depends on individual ops. +std::unique_ptr> +CreateTfToMlrtConversionPass(const TfrtPipelineOptions& options); + +std::unique_ptr> +CreateTfToMlrtConversionPass(const TfrtPipelineOptions& options, + const tfrt_stub::FallbackState* fallback_state); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TF_TO_MLRT_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.cc new file mode 100644 index 00000000000..0212c945de6 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.cc @@ -0,0 +1,169 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h" + +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_tpu_ops.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/utils.h" + +namespace tensorflow { +namespace mlrt_compiler { +namespace { + +class TPUCompileMlirAndExecuteOpPreParallelizationConversion + : public mlir::OpConversionPattern { + public: + TPUCompileMlirAndExecuteOpPreParallelizationConversion( + mlir::MLIRContext* context, bool use_tpu_host_allocator_for_inputs) + : OpConversionPattern(context), + use_tpu_host_allocator_for_inputs_(use_tpu_host_allocator_for_inputs) {} + + mlir::LogicalResult matchAndRewrite( + mlir::TF::TPUCompileMlirAndExecuteOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter& rewriter) const override { + llvm::SmallVector constant_operand_indices; + llvm::SmallVector non_constant_operand_indices; + + for (int i = 0; i < adaptor.getArgs().size(); ++i) { + auto operand = adaptor.getOperands()[i]; + auto original_operand = op.getOperand(i); + if (IsResultVariable(original_operand, operand)) { + // NOTE: It's important to populate constant_operand_indices in + // ascending order. 
+ constant_operand_indices.push_back(i); + } else { + non_constant_operand_indices.push_back(i); + } + } + + llvm::SmallVector operands = adaptor.getArgs(); + + size_t tensor_operands_size = operands.size(); + operands.append(adaptor.getStaticShapes().begin(), + adaptor.getStaticShapes().end()); + + auto producer_name = op->getAttrOfType("producer_name"); + + llvm::SmallVector operands_with_static_shapes; + if (adaptor.getOperandsWithStaticShape().has_value()) { + for (auto attr : adaptor.getOperandsWithStaticShapeAttr() + .getAsRange()) { + operands_with_static_shapes.push_back( + static_cast(attr.getInt())); + } + } + + if (use_tpu_host_allocator_for_inputs_) { + llvm::DenseMap replaced_ops; + + for (int i : non_constant_operand_indices) { + DCHECK_LT(i, op.getNumOperands()); + auto old_value = operands[i]; + mlir::Operation* def = old_value.getDefiningOp(); + + if (def && llvm::isa(def->getDialect())) { + auto*& op_with_device = replaced_ops[def]; + if (!op_with_device) { + mlir::ConversionPatternRewriter::InsertionGuard guard(rewriter); + rewriter.setInsertionPoint(def); + + op_with_device = rewriter.clone(*def); + op_with_device->setAttr(kTfMlrtCustomDevice, + rewriter.getStringAttr(kTpuHostDevice)); + rewriter.replaceOp(def, op_with_device->getResults()); + } + } + } + } + + auto compile_and_execute_op = + rewriter.create( + op.getLoc(), op.getResultTypes(), operands, + rewriter.getDenseI32ArrayAttr(constant_operand_indices), + op.getMetadataAttr(), op.getMlirModuleAttr(), + rewriter.getUI32IntegerAttr(tensor_operands_size), + rewriter.getDenseI32ArrayAttr(operands_with_static_shapes), + producer_name); + + rewriter.replaceOp(op, compile_and_execute_op->getResults()); + + return mlir::success(); + } + + private: + bool use_tpu_host_allocator_for_inputs_ = false; +}; + +class TPUCompileMlirAndExecuteOpConversion + : public mlir::OpConversionPattern { + public: + TPUCompileMlirAndExecuteOpConversion(mlir::TypeConverter* type_converter, + mlir::MLIRContext* 
context, + ExecuteOpRegistry* execute_op_registry) + : OpConversionPattern(*type_converter, context) {} + + mlir::LogicalResult matchAndRewrite( + tf_mlrt::TFTPUCompileAndExecuteOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter& rewriter) const override { + llvm::SmallVector operands = + adaptor.getOperandsAndStaticShapes(); + llvm::SmallVector result_types; + result_types.push_back(rewriter.getType()); + result_types.append(op.getResults().size(), + rewriter.getType()); + + auto compile_and_execute_op = + rewriter.create( + op.getLoc(), result_types, operands, op.getConstantOperandIndices(), + op.getMetadataAttr(), op.getMlirModuleAttr(), op.getNumOperands(), + op.getOperandsWithStaticShape(), op.getProducerName()); + + rewriter.replaceOp(op, compile_and_execute_op->getResults()); + + return mlir::success(); + } +}; + +} // namespace + +void PopulateTpuPreParallelizationConversionPatterns( + mlir::ConversionTarget& target, mlir::RewritePatternSet& patterns, + const TfrtPipelineOptions& options) { + target.addIllegalOp(); + patterns.add( + patterns.getContext(), options.use_tpu_host_allocator_for_inputs); +} + +void PopulateTpuConversionPatterns(mlir::ConversionTarget& target, + mlir::RewritePatternSet& patterns, + mlir::TypeConverter& type_converter, + ExecuteOpRegistry& execute_op_registry, + const TfrtPipelineOptions& options) { + target.addIllegalOp(); + target.addLegalDialect(); + + patterns.add( + &type_converter, patterns.getContext(), &execute_op_registry); +} + +void RegisterTpuDialect(mlir::DialectRegistry& registry) { + registry.insert(); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h new file mode 100644 index 00000000000..979b4b46033 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tpu_conversion_patterns.h @@ -0,0 +1,45 @@ +/* Copyright 2021 The 
TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TPU_CONVERSION_PATTERNS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TPU_CONVERSION_PATTERNS_H_ + +#include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/Transforms/DialectConversion.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/execute_op_registry.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h" + +namespace tensorflow { +namespace mlrt_compiler { + +inline constexpr char kTfMlrtCustomDevice[] = "tf_mlrt.custom_device"; +inline constexpr char kTpuHostDevice[] = "tpu_host_device"; + +void RegisterTpuDialect(mlir::DialectRegistry& registry); + +void PopulateTpuPreParallelizationConversionPatterns( + mlir::ConversionTarget& target, mlir::RewritePatternSet& patterns, + const TfrtPipelineOptions& options); + +void PopulateTpuConversionPatterns(mlir::ConversionTarget& target, + mlir::RewritePatternSet& patterns, + mlir::TypeConverter& type_converter, + ExecuteOpRegistry& execute_op_registry, + const TfrtPipelineOptions& options); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_TPU_CONVERSION_PATTERNS_H_ diff --git 
a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc new file mode 100644 index 00000000000..fb110fb01f2 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc @@ -0,0 +1,43 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h" + +#include "mlir/IR/Operation.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_a_m.h.inc" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h.inc" +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h.inc" + +namespace tensorflow { +namespace mlrt_compiler { + +bool UseFallback(mlir::Operation *op) { + if (!llvm::isa(op->getDialect())) return false; + + // TODO(b/173017701): have a centralized place to hold the information + // whether a TF op should be lowered to FallbackExecute op. 
+ // LINT.IfChange(fallback_allow_list) + return !llvm::isa(op); + // LINT.ThenChange(tf_to_mlrt.cc:fallback_allow_list) +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h new file mode 100644 index 00000000000..c47471f67cd --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h @@ -0,0 +1,30 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_UTIL_H_ + +#include "mlir/IR/Operation.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +// Use fallback by default for anything that does not have a native kernel +// with some exceptions. +bool UseFallback(mlir::Operation *op); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc new file mode 100644 index 00000000000..a7975c40e1f --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.cc @@ -0,0 +1,944 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_set.h" +#include "absl/log/check.h" +#include "absl/strings/str_cat.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Block.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/IRMapping.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/RegionUtils.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" +#include "tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.h" + +namespace tensorflow { +namespace mlrt_compiler { 
+namespace { + +void RemoveIdentityOp(mlir::func::FuncOp func) { + auto &block = func.getBody().front(); + llvm::SmallVector identity_ops; + for (auto &op : block) { + if (auto identity_op = llvm::dyn_cast(&op)) { + identity_ops.push_back(identity_op); + } + } + + for (auto op : llvm::reverse(identity_ops)) { + op.getOutput().replaceAllUsesWith(op.getInput()); + } + + for (auto op : identity_ops) { + op->erase(); + } + + auto return_op = llvm::cast(block.getTerminator()); + + auto func_type = mlir::FunctionType::get( + func.getContext(), func.getArgumentTypes(), return_op.getOperandTypes()); + + func.setType(func_type); +} + +// tf.map_fn (https://www.tensorflow.org/api_docs/python/tf/map_fn) is converted +// to tf.while during lowering. tf.map_fn expects parallel execution of its body +// function but not all tf.while can guarantee parallel executions. The tf.while +// op that is converted from tf.map_fn has distinct programming patterns. This +// pass matches those patterns to convert applicable tf.while to tf_mlrt.map_fn +// for parallel execution of the body function. +// +// For example, tf.map_fn(fn, elems, ...) can be converted to the following: +// +// %tensor_list = "tf.TensorListReserve"(%per_iteration_shape, %max_iterations) +// +// %while_outputs:7 = "tf.While"(%loop_counter, +// %tensor_list_index, %other_args, %tensor_list) {body = @while_body, cond = +// @while_cond} +// +// %outputs = "tf.TensorListStack"(%while_outputs#2, %output_shape) +// +// in which +// +// while_cond: check loop_counter and tensor_list_index both smaller than +// max_iterations. +// +// while_body: loop_counter and tensor_list_index is incremented and returned; +// also gather input from elems based on un-incremented tensor_list_index, +// call fn and set output into a TensorList at tensor_list_index. +// +// This pass additionally assumes the following patterns to identify a tf.While +// that are converted from tf.map_fn: +// 1. 
Arguments have one loop_counter and one element_index that are initialized +// to be 0. +// 2. TensorList or TensorArray is reserved with max_iterations size. The +// max_iterations shall be a constant. +// 3. The predicate function check both loop_counter and element_index is less +// than max_iterations. +// 4. The body function increase loop_counter and element_index by 1 and use +// element_index to stores its result into tensor list or tensor array such +// that there is no overlap in write between iterations +// 5. The body function does not have side effects such that one iteration will +// impact the next iteration outside #4. +// +// After conversion, the pseudocode is +// +// %tensor_list = "tf.TensorListReserve"(%per_iteration_shape, %max_iterations) +// +// %updated_tensor_list = "tf_mlrt.map_fn" (%max_iterations, %tensor_list, +// %other_args) {body = @map_fn_body} +// +// %outputs = "tf.TensorListStack"(%updated_tensor_list, %output_shape) +// +// where +// +// tf_mlrt.map_fn leads to a blocking call and +// the argument list of tf_mlrt.map_fn is (%max_iterations, %tensor_list, +// tf.while's argument list minus loop_counter, tensor_list_index and +// tensor_list). tf_mlrt.map_fn is a block call and returns the updated tensor +// list. +// +// map_fn_body has an input signature of (%in_tensor_list_future, +// %out_tensor_list_promise, %loop_counter, %tensor_list_index, %other_args) and +// has not return values (the updated_tensor_list is delivered through +// %out_tensor_list_promise). +// +class WhileToMapFnPass + : public mlir::PassWrapper> { + public: + WhileToMapFnPass() = default; + WhileToMapFnPass &operator=(const WhileToMapFnPass &) = delete; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(WhileToMapFnPass) + + private: + struct LoopInfo { + // Argument indices in while op of key loop variables. 
+ int loop_counter = -1; + int element_index = -1; + std::vector tensor_list_or_flow_in; + // Max iteration may be passed in as an argument to while op. + std::optional max_iterations_arg_idx; + // Max itertions may be hard coded as constant inside while predicate + // function. + std::optional max_iterations_value; + // Defining Op of max_iterations. + mlir::Value max_iterations; + }; + + void getDependentDialects(mlir::DialectRegistry ®istry) const override { + registry.insert(); + registry.insert(); + } + + llvm::StringRef getArgument() const final { + return "tf-mlrt-while-to-map-fn"; + } + + llvm::StringRef getDescription() const final { + return "Convert tf.while to tf_mlrt.map_fn when possible for parallel " + "execution."; + } + + void runOnOperation() override { + mlir::ModuleOp module = getOperation(); + mlir::SymbolTable symbol_table(module); + + // Use make_early_inc_range because the processing might insert new node + // into the list + for (auto func_op : + llvm::make_early_inc_range(module.getOps())) { + MayConvertWhileToMapFn(func_op, symbol_table); + } + } + + // We match while op's predicate function and body function with known + // patterns from tf.map_fn. If matched, tf.while is converted to + // tf_mlrt.map_fn. + void MayConvertWhileToMapFn(mlir::func::FuncOp op, + mlir::SymbolTable &symbol_table) { + mlir::OpBuilder builder(op); + for (mlir::Operation &op : llvm::make_early_inc_range(op.front())) { + auto while_op = llvm::dyn_cast(&op); + if (!while_op) continue; + LoopInfo loop_info; + if (mlir::succeeded(MatchPredicate(while_op.getCondAttr(), symbol_table, + loop_info)) && + mlir::succeeded( + MatchBody(while_op.getBodyAttr(), symbol_table, loop_info)) && + mlir::succeeded(MatchInputSource(while_op, loop_info)) && + mlir::succeeded(MatchOutputUse(while_op, loop_info))) { + // Input, predicate function, body function and output are all following + // patterns, we can convert it to tf_mlrt.map_fn. 
+ mlir::func::FuncOp while_body_func = + symbol_table.lookup(while_op.getBody()); + auto map_fn_body_func = CreateMapFnBodyFunction( + builder, while_body_func, symbol_table, loop_info); + + mlir::OpBuilder::InsertionGuard guard(builder); + builder.setInsertionPointAfter(while_op); + std::vector invariant_arguments; + invariant_arguments.reserve(while_op->getNumOperands()); + + absl::flat_hash_set variant_arguments = {loop_info.loop_counter, + loop_info.element_index}; + variant_arguments.insert(loop_info.tensor_list_or_flow_in.begin(), + loop_info.tensor_list_or_flow_in.end()); + for (int i = 0; i < while_op->getNumOperands(); ++i) { + if (variant_arguments.contains(i)) { + continue; + } + invariant_arguments.push_back(while_op.getOperand(i)); + } + + llvm::SmallVector result_types; + llvm::SmallVector tensor_list_operands; + for (int i = 0; i < loop_info.tensor_list_or_flow_in.size(); ++i) { + tensor_list_operands.push_back( + while_op.getOperand(loop_info.tensor_list_or_flow_in[i])); + result_types.push_back( + while_op.getResult(loop_info.tensor_list_or_flow_in[i]) + .getType()); + } + + auto map_fn_op = builder.create( + while_op.getLoc(), result_types, loop_info.max_iterations, + tensor_list_operands, invariant_arguments, + map_fn_body_func.getSymName(), + loop_info.tensor_list_or_flow_in.size()); + + // MatchOutputUse already makes sure only the tensor_list or + // tensor_array output is used. 
+ absl::flat_hash_map old_arg_indx_to_new_index; + for (int i = 0; i < loop_info.tensor_list_or_flow_in.size(); ++i) { + old_arg_indx_to_new_index.insert( + {loop_info.tensor_list_or_flow_in[i], i}); + } + for (int i = 0; i < while_op.getResults().size(); ++i) { + if (!old_arg_indx_to_new_index.contains(i)) { + while_op.getResult(i).dropAllUses(); + } else { + while_op.getResult(i).replaceAllUsesWith( + map_fn_op.getResult()[old_arg_indx_to_new_index[i]]); + } + } + + while_op.erase(); + } + } + } + + // Match that (a) the tensor list or tensor array are reserved with + // max_iterations size such that parallel operations on tensor list or tensor + // array is thread safe; (b) loop_counter and element_index starts with 0. + // Also may identify source of max_iterations. + mlir::LogicalResult MatchInputSource(mlir::TF::WhileOp while_op, + LoopInfo &loop_info) { + // Element index and loop counter should start from 0. + if (!mlir::matchPattern(while_op.getOperand(loop_info.loop_counter), + mlir::m_Zero()) || + !mlir::matchPattern(while_op.getOperand(loop_info.element_index), + mlir::m_Zero())) { + return mlir::failure(); + } + + DCHECK_GE(loop_info.tensor_list_or_flow_in.size(), 1); + // Tensor list or a tensor array are reserved + + for (auto tensor_list_index : loop_info.tensor_list_or_flow_in) { + mlir::Operation *tensor_list_or_flow_in_defining_op = + while_op.getOperand(tensor_list_index).getDefiningOp(); + mlir::Operation *max_iterations = nullptr; + if (loop_info.max_iterations_arg_idx.has_value()) { + max_iterations = + while_op.getOperand(loop_info.max_iterations_arg_idx.value()) + .getDefiningOp(); + } + if (auto tensor_list_reserve = + llvm::dyn_cast( + tensor_list_or_flow_in_defining_op)) { + // Tensor list should resever for max_iterations. 
+ mlir::Operation *tensor_list_reserve_size = + tensor_list_reserve.getNumElements().getDefiningOp(); + + if (tensor_list_reserve_size != max_iterations) { + // if tensor list is not reserved by max_iteration variable, then + // another acceptable case is that both contain same constant values. + llvm::APInt reserved_cst; + if (!mlir::matchPattern(tensor_list_reserve_size, + mlir::m_ConstantInt(&reserved_cst)) || + !loop_info.max_iterations_value.has_value() || + reserved_cst.getZExtValue() != + loop_info.max_iterations_value.value()) { + return mlir::failure(); + } + } + // TensorListReserveOp has only one result and is already in used by + // while. + loop_info.max_iterations = tensor_list_reserve.getNumElements(); + } else if (auto tensor_array = llvm::dyn_cast( + tensor_list_or_flow_in_defining_op)) { + mlir::Operation *tensor_array_size = + tensor_array.getOperand().getDefiningOp(); + if (tensor_array_size != max_iterations) { + // if tensor array is not reserved by max_iteration variable, then + // another acceptable case is that both contain same constant values. + llvm::APInt reserved_cst; + if (!mlir::matchPattern(tensor_array_size, + mlir::m_ConstantInt(&reserved_cst)) || + !loop_info.max_iterations_value.has_value() || + reserved_cst.getZExtValue() != + loop_info.max_iterations_value.value()) { + return mlir::failure(); + } + } + + // Other than flow_in, the tensor array should be used by while as well. + if (!llvm::is_contained(while_op.getOperands(), + tensor_array.getHandle())) { + return mlir::failure(); + } + loop_info.max_iterations = tensor_array.getSize(); + } else { + return mlir::failure(); + } + } + return mlir::success(); + } + + // Match the map_attern that output of while op is subsequentially stacked. 
+ mlir::LogicalResult MatchOutputUse(mlir::TF::WhileOp &while_op, + const LoopInfo &loop_info) { + absl::flat_hash_set used_results; + used_results.insert(loop_info.tensor_list_or_flow_in.begin(), + loop_info.tensor_list_or_flow_in.end()); + for (int i = 0; i < while_op->getResults().size(); ++i) { + if (used_results.contains(i)) { + // Tensor list or flow in should be used next. + if (!while_op->getResult(i).hasOneUse()) { + return mlir::failure(); + } + } else { + // No other result should be used. + if (!while_op->getResult(i).use_empty()) { + return mlir::failure(); + } + } + } + + for (auto result_index : loop_info.tensor_list_or_flow_in) { + mlir::Operation *use_op = + *while_op->getResult(result_index).getUsers().begin(); + + if (!llvm::isa(use_op)) { + return mlir::failure(); + } + } + return mlir::success(); + } + + // Match that the while predicate function is doing just + // loop_counter < max iterations && element_index < max_iterations. + // Through this pattern, we also update the argument index of + // loop_counter, element_index and possibly max_iterations. + mlir::LogicalResult MatchPredicate(mlir::FlatSymbolRefAttr predicate_fn, + const mlir::SymbolTable &symbol_table, + LoopInfo &loop_info) { + mlir::func::FuncOp predicate_fn_op = + symbol_table.lookup(predicate_fn.getValue()); + + // The body of the predicate function should have two LessOp and one + // LogicalAndOp. It can optionally has IdentityOp and ToBoolOp. 
+ enum class PredicateBodyExpectingOp { + kExpectFirstLess, + kExpectSecondLess, + kExpectLogicalAnd, + kExpectTerminator + }; + std::vector less_ops; + less_ops.reserve(2); + PredicateBodyExpectingOp expecting_op = + PredicateBodyExpectingOp::kExpectFirstLess; + for (auto &body_op : predicate_fn_op.getBody().front()) { + switch (expecting_op) { + case PredicateBodyExpectingOp::kExpectFirstLess: + if (llvm::isa(body_op)) { + expecting_op = PredicateBodyExpectingOp::kExpectSecondLess; + less_ops.push_back(&body_op); + } else if (!llvm::isa( + body_op)) { + return mlir::failure(); + } + break; + case PredicateBodyExpectingOp::kExpectSecondLess: + if (llvm::isa(body_op)) { + expecting_op = PredicateBodyExpectingOp::kExpectLogicalAnd; + less_ops.push_back(&body_op); + } else if (!llvm::isa( + body_op)) { + return mlir::failure(); + } + break; + case PredicateBodyExpectingOp::kExpectLogicalAnd: + if (llvm::isa(body_op)) { + expecting_op = PredicateBodyExpectingOp::kExpectTerminator; + } else if (!llvm::isa(body_op)) { + return mlir::failure(); + } + break; + case PredicateBodyExpectingOp::kExpectTerminator: + if (!llvm::isa(body_op)) { + return mlir::failure(); + } + break; + default: + return mlir::failure(); + } + } + + // Identify loop_counter + int counter_index = -1; + auto counter_iter = + llvm::find(predicate_fn_op.getArguments(), less_ops[0]->getOperand(0)); + if (counter_iter != predicate_fn_op.getArguments().end()) { + counter_index = counter_iter->getArgNumber(); + if (!IsScalarOrUnrankedI32Tensor( + predicate_fn_op.getArgument(counter_index))) { + return mlir::failure(); + } + } + + // Find upper bound on loop_counter. + int max_iter_index_from_counter = -1; + int max_iter_value_from_counter = -1; + if (auto max_iter_iter = llvm::find(predicate_fn_op.getArguments(), + less_ops[0]->getOperand(1)); + max_iter_iter != predicate_fn_op.getArguments().end()) { + // Upper bound on loop_counter is from one argument. 
+ max_iter_index_from_counter = max_iter_iter->getArgNumber(); + // Argument has to be int32 + if (!IsScalarOrUnrankedI32Tensor( + predicate_fn_op.getArgument(max_iter_index_from_counter))) { + return mlir::failure(); + } + } else { + // If upper bound is not passed in, it has to be a constant + llvm::APInt value; + if (!mlir::matchPattern(less_ops[0]->getOperand(1).getDefiningOp(), + mlir::m_ConstantInt(&value))) { + return mlir::failure(); + } + max_iter_value_from_counter = value.getZExtValue(); + } + + // Identify element_index + int element_index = -1; + auto element_index_iter = + llvm::find(predicate_fn_op.getArguments(), less_ops[1]->getOperand(0)); + if (element_index_iter != predicate_fn_op.getArguments().end()) { + element_index = element_index_iter->getArgNumber(); + if (!IsScalarOrUnrankedI32Tensor( + predicate_fn_op.getArgument(element_index))) { + return mlir::failure(); + } + } + + // Find upper bound on element_index. + int max_iter_index_from_element = -1; + int max_iter_value_from_element = -1; + if (auto max_iter_iter = llvm::find(predicate_fn_op.getArguments(), + less_ops[1]->getOperand(1)); + max_iter_iter != predicate_fn_op.getArguments().end()) { + // Upper bound on element_index is from one argument. + max_iter_index_from_element = max_iter_iter->getArgNumber(); + // Upper bound argument needs to be int32 + if (!IsScalarOrUnrankedI32Tensor( + predicate_fn_op.getArgument(max_iter_index_from_element))) { + return mlir::failure(); + } + } else { + // If upper bound is not passed in, it has to be a constant + llvm::APInt value; + if (!mlir::matchPattern(less_ops[1]->getOperand(1).getDefiningOp(), + mlir::m_ConstantInt(&value))) { + return mlir::failure(); + } + max_iter_value_from_element = value.getZExtValue(); + } + + // Loop_counter is always available. + if (counter_index < 0) return mlir::failure(); + // element_index can change its location, but will always be provided. 
+ if (element_index < 0) return mlir::failure(); + + std::optional max_iter_const; + std::optional max_iter_index; + if (max_iter_index_from_counter < 0 && max_iter_index_from_element < 0) { + // If both loop counter and element index are not upper bounded by passing + // in arguments, they shall be upper bounded by constants of same value. + if (max_iter_value_from_element != max_iter_value_from_counter || + max_iter_value_from_element < 0 || max_iter_value_from_counter < 0) { + return mlir::failure(); + } else { + max_iter_const = max_iter_value_from_element; + } + } else if (max_iter_index_from_counter >= 0 && + max_iter_index_from_element >= 0) { + // Loop counter or element are upper bounded by pass-in arguments. + // They need to be upper bounded by the same argument + if (max_iter_index_from_element != max_iter_index_from_counter) { + return mlir::failure(); + } else { + max_iter_index = max_iter_index_from_counter; + } + } else { + // TODO(deqiangc): remove this clause after verifying grappler pass remove + // the case that one of them is bounded by pass-in argument and the other + // is bounded by constants. + max_iter_index = + std::max(max_iter_index_from_counter, max_iter_index_from_element); + max_iter_const = + std::max(max_iter_value_from_element, max_iter_value_from_counter); + } + + // Update hypothesis + loop_info.loop_counter = counter_index; + loop_info.element_index = element_index; + loop_info.max_iterations_arg_idx = max_iter_index; + loop_info.max_iterations_value = max_iter_const; + return mlir::success(); + } + + // Match that the current hypothesis of current loop_counter and element_index + // in the while body function based on the following simple pattern: + // %updated_loop_counter = %loop_counter + 1 + // %updated_element_index = %element_index + 1 + // %loaded_elem = tf.Gather(.., %element_index,... 
) + // DoSomething + // tf.TensorListSetItem(.., %element_index) + // return %update_loop_counter, %updated_element_index, + // %tensor_array_list, %max_iterations, %other_args + mlir::LogicalResult MatchLoopCounterElementIndexInBody( + mlir::func::FuncOp while_body_func, LoopInfo &loop_info) { + mlir::Block &block = while_body_func.getBlocks().front(); + + // Verify argument loop_counter is +1 and returned at the same location. + mlir::BlockArgument loop_counter = + block.getArgument(loop_info.loop_counter); + llvm::SmallVector loop_counter_users = + GetUsersIgnoringIdentityOp(loop_counter); + if (loop_counter_users.size() != 1 || + !llvm::isa( + loop_counter_users.front()) || + !mlir::matchPattern( + loop_counter_users.front()->getOperand(1).getDefiningOp(), + mlir::m_One())) { + return mlir::failure(); + } + + // loop_counter + 1 is in ReturnOp's operand. + if (loop_counter_users.front() != + GetDefiningOpIgnoringIdentityOp( + GetReturnedOperand(while_body_func, loop_info.loop_counter))) { + return mlir::failure(); + } + + // Verify element_index's usage and also identify the argument index of + // tensor list or tensor array flow_in. + std::vector tensor_list_or_flow_in_index; + mlir::BlockArgument element_index = + block.getArgument(loop_info.element_index); + for (auto *element_index_use : GetUsersIgnoringIdentityOp(element_index)) { + if (llvm::isa(element_index_use)) { + // One use of element_index is +1 and then returned at the same + // location. 
+ if (!mlir::matchPattern( + element_index_use->getOperand(1).getDefiningOp(), + mlir::m_One()) || + element_index_use != + GetDefiningOpIgnoringIdentityOp(GetReturnedOperand( + while_body_func, loop_info.element_index))) { + return mlir::failure(); + } + } else if (llvm::isa(element_index_use)) { + if (auto tensor_list_index = MayGetArgumentIndexIgnoringIdentityOp( + while_body_func, + llvm::dyn_cast(element_index_use) + .getInputHandle()); + !tensor_list_index.has_value()) { + return mlir::failure(); + } else { + tensor_list_or_flow_in_index.push_back(tensor_list_index.value()); + } + } else if (llvm::isa(element_index_use)) { + if (auto flow_in_index = MayGetArgumentIndexIgnoringIdentityOp( + while_body_func, llvm::dyn_cast( + element_index_use) + .getFlowIn()); + !flow_in_index.has_value()) { + return mlir::failure(); + } else { + tensor_list_or_flow_in_index.push_back(flow_in_index.value()); + } + } else if (!llvm::isa(element_index_use)) { + // The only other use is to either gather the input or set output. + return mlir::failure(); + } + } + + if (tensor_list_or_flow_in_index.empty()) { + return mlir::failure(); + } + + // Update hypothesis + loop_info.tensor_list_or_flow_in = std::move(tensor_list_or_flow_in_index); + + return mlir::success(); + } + + // Match that the while body function is the following simple pattern: + // %updated_loop_counter = %loop_counter + 1 + // %updated_element_index = %element_index + 1 + // %loaded_elem = tf.Gather(.., %element_index,... ) + // DoSomething + // tf.TensorListSetItem(.., %element_index) + // return %update_loop_counter, %updated_element_index, + // %tensor_array_list, %max_iterations, %other_args + // + // in which + // DoSomething has no side-effect on the next iteration. + // + // Also identify argument index for TensorList or TensorArray flow_in. 
+ mlir::LogicalResult MatchBody(mlir::FlatSymbolRefAttr while_body_func_name, + const mlir::SymbolTable &symbol_table, + LoopInfo &loop_info) { + mlir::func::FuncOp while_body_func = + symbol_table.lookup( + while_body_func_name.getValue()); + + if (mlir::failed( + MatchLoopCounterElementIndexInBody(while_body_func, loop_info))) { + // Swap the order of loop_counter and element_index in the current + // hypothesis and try again + int swap = loop_info.loop_counter; + loop_info.loop_counter = loop_info.element_index; + loop_info.element_index = swap; + if (mlir::failed( + MatchLoopCounterElementIndexInBody(while_body_func, loop_info))) { + return mlir::failure(); + } + } + + // The next iteration of while_body does not depend on the previous + // iteration except loop_counter, element_index, tensor_list_or_flow_in, and + // max_iterations. + absl::flat_hash_set allowed_variable_between_iterations; + allowed_variable_between_iterations.insert(loop_info.loop_counter); + allowed_variable_between_iterations.insert(loop_info.element_index); + if (loop_info.max_iterations_arg_idx.has_value()) { + allowed_variable_between_iterations.insert( + loop_info.max_iterations_arg_idx.value()); + } + allowed_variable_between_iterations.insert( + loop_info.tensor_list_or_flow_in.begin(), + loop_info.tensor_list_or_flow_in.end()); + for (int j = 0; j < while_body_func.getNumArguments(); j++) { + if (!allowed_variable_between_iterations.contains(j)) { + if (GetReturnedOperand(while_body_func, j) != + while_body_func.getArgument(j)) { + return mlir::failure(); + } + } + } + + return mlir::success(); + } + + // The map_fn body function is a clone of the while_body_func that + // canonicalize loop_counter and tensor_list_index to be the first two + // arguments. 
+ mlir::func::FuncOp CreateMapFnBodyFunction(mlir::OpBuilder &builder, + mlir::func::FuncOp while_body_func, + mlir::SymbolTable &symbol_table, + const LoopInfo &loop_info) { + std::string map_fn_body_name = + absl::StrCat(while_body_func.getSymName().str(), "/MapFnBody"); + + if (auto func = symbol_table.lookup(map_fn_body_name)) { + return func; + } + + RemoveIdentityOp(while_body_func); + + absl::flat_hash_set variant_arguments = {loop_info.loop_counter, + loop_info.element_index}; + variant_arguments.insert(loop_info.tensor_list_or_flow_in.begin(), + loop_info.tensor_list_or_flow_in.end()); + llvm::SmallVector remapped_input_type; + + for (int i = 0; i < loop_info.tensor_list_or_flow_in.size(); i++) { + remapped_input_type.push_back( + builder.getType()); + remapped_input_type.push_back( + builder.getType()); + } + + remapped_input_type.push_back( + while_body_func.getFunctionType().getInput(loop_info.loop_counter)); + remapped_input_type.push_back( + while_body_func.getFunctionType().getInput(loop_info.element_index)); + for (int i = 0; i < while_body_func.getFunctionType().getNumInputs(); i++) { + if (!variant_arguments.contains(i)) { + remapped_input_type.push_back( + while_body_func.getFunctionType().getInput(i)); + } + } + mlir::OpBuilder::InsertionGuard insertion_guard(builder); + builder.setInsertionPointAfter(while_body_func); + auto map_fn_body_func = builder.create( + while_body_func.getLoc(), map_fn_body_name, + mlir::FunctionType::get(while_body_func.getContext(), + remapped_input_type, {})); + + map_fn_body_func->setAttr( + "tfrt.cost_threshold", + builder.getI64IntegerAttr(std::numeric_limits::max())); + + if (while_body_func.getArgAttrs().has_value()) { + llvm::SmallVector remapped_input_attributes; + // No attributes carry over for tensor list future/promise. 
+ for (int i = 0; i < loop_info.tensor_list_or_flow_in.size(); i++) { + remapped_input_attributes.push_back(mlir::Attribute()); + remapped_input_attributes.push_back(mlir::Attribute()); + } + auto args_attrs = while_body_func.getArgAttrs().value(); + remapped_input_attributes.push_back(args_attrs[loop_info.loop_counter]); + remapped_input_attributes.push_back(args_attrs[loop_info.element_index]); + for (int i = 0; i < args_attrs.size(); i++) { + if (!variant_arguments.contains(i)) { + remapped_input_attributes.push_back(args_attrs[i]); + } + } + map_fn_body_func.setAllArgAttrs(remapped_input_attributes); + } + auto future_index = [](int i) { return 2 * i; }; + auto promise_index = [](int i) { return 2 * i + 1; }; + + if (while_body_func.getResAttrs().has_value()) { + // The order and types of results remain the same; so does attributes. + map_fn_body_func.setAllResultAttrs(while_body_func.getResAttrs().value()); + } + map_fn_body_func.setVisibility(mlir::func::FuncOp::Visibility::Private); + + builder.setInsertionPointToEnd(map_fn_body_func.addEntryBlock()); + + mlir::IRMapping mapping; + std::vector await_ops; + for (int i = 0; i < loop_info.tensor_list_or_flow_in.size(); i++) { + await_ops.push_back(builder.create( + while_body_func.getLoc(), + while_body_func.getArgument(loop_info.tensor_list_or_flow_in.at(i)) + .getType(), + map_fn_body_func.getArgument(future_index(i)))); + + mapping.map( + while_body_func.getArgument(loop_info.tensor_list_or_flow_in.at(i)), + await_ops.at(i)); + } + // Rest of argument start after promise + int map_fn_argument_index = + promise_index(loop_info.tensor_list_or_flow_in.size() - 1); + mapping.map(while_body_func.getArgument(loop_info.loop_counter), + map_fn_body_func.getArgument(++map_fn_argument_index)); + mapping.map(while_body_func.getArgument(loop_info.element_index), + map_fn_body_func.getArgument(++map_fn_argument_index)); + for (int i = 0; i < while_body_func.getNumArguments(); i++) { + if (!variant_arguments.contains(i)) 
{ + mapping.map(while_body_func.getArgument(i), + map_fn_body_func.getArgument(++map_fn_argument_index)); + } + } + + for (auto &op : while_body_func.getBody().front()) { + builder.clone(op, mapping); + } + + auto return_op = map_fn_body_func.getBody().front().getTerminator(); + + mlir::Operation *first_write = nullptr; + // Move tensor list write to the end of the block. + for (int index : loop_info.tensor_list_or_flow_in) { + auto *def = return_op->getOperand(index).getDefiningOp(); + CHECK(def); // Crash OK + def->moveBefore(return_op); + if (!first_write) first_write = def; + } + + // Move the await op before the first write. + for (auto tensor_list_or_flow_in : await_ops) { + tensor_list_or_flow_in->moveBefore(first_write); + } + + // Insert promise right before return + builder.setInsertionPoint(return_op); + for (int i = 0; i < await_ops.size(); i++) { + builder.create( + return_op->getLoc(), map_fn_body_func.getArgument(promise_index(i)), + return_op->getOperand(loop_info.tensor_list_or_flow_in.at(i))); + } + builder.create(return_op->getLoc()); + return_op->erase(); + + symbol_table.insert(map_fn_body_func); + + return map_fn_body_func; + } + + std::optional MayGetArgumentIndexIgnoringIdentityOp( + mlir::func::FuncOp func, mlir::Value value) const { + // Value may go through some identify chains. + while (value.getDefiningOp()) { + if (!llvm::isa(value.getDefiningOp())) { + return std::nullopt; + } + value = value.getDefiningOp()->getOperand(0); + } + + // Value is directly from argument since it has no defining op. + auto argument_iter = llvm::find(func.getArguments(), value); + if (argument_iter == func.getArguments().end()) { + return std::nullopt; + } + return argument_iter->getArgNumber(); + } + + // Given a value, find its use ignoring identify op. 
+ // For example, given the below chains: + // + // %original_value = OriginalDefinedOp() + // %value1 = tf.IdentifyOp(original_value) + // %value2 = tf.IdentifyOp(value1) + // UseOp(%value2) + // + // GetUseIgnroningIdentifyOp(%original_value) will return UseOp + llvm::SmallVector GetUsersIgnoringIdentityOp( + mlir::Value value) { + llvm::SmallVector users; + std::vector users_stack; + + for (auto *direct_user : value.getUsers()) { + users_stack.push_back(direct_user); + } + + while (!users_stack.empty()) { + mlir::Operation *descendent_user = users_stack.back(); + users_stack.pop_back(); + + if (!llvm::isa(descendent_user)) { + users.push_back(descendent_user); + } else { + // User of identify op is considered as user. + for (auto *user : descendent_user->getResult(0).getUsers()) { + users_stack.push_back(user); + } + } + } + return users; + } + + // Given a value, find its source defined op ignoring identify op. + // For example, given the below chains: + // + // %original_value = OriginalDefinedOp() + // %value1 = tf.IdentifyOp(original_value) + // %value2 = tf.IdentifyOp(value1) + // UseOp(%value2) + // + // GetDefiningOpIgnroningIdentifyOp(%value2) will return OriginalDefinedOp + mlir::Operation *GetDefiningOpIgnoringIdentityOp(mlir::Value value) { + mlir::Operation *source_op = value.getDefiningOp(); + while (llvm::isa(source_op)) { + source_op = source_op->getOperand(0).getDefiningOp(); + } + return source_op; + } + + mlir::Value GetReturnedOperand(const mlir::func::FuncOp func, + uint32_t result_index) { + auto return_op = llvm::dyn_cast( + func->getRegion(0).front().getTerminator()); + DCHECK_NE(return_op, nullptr); + return return_op->getOperand(result_index); + } + + bool IsScalarI32Tensor(mlir::Value value) const { + if (auto value_type = llvm::dyn_cast(value.getType())) { + if (value_type.getElementType().isInteger(32) && value_type.hasRank() && + value_type.getRank() == 0) { + return true; + } + } + return false; + } + + bool 
IsScalarOrUnrankedI32Tensor(mlir::Value value) const { + if (auto value_type = llvm::dyn_cast(value.getType())) { + if (value_type.getElementType().isInteger(32) && + ((value_type.hasRank() && value_type.getRank() == 0) || + !value_type.hasRank())) { + return true; + } + } + return false; + } +}; +} // namespace + +std::unique_ptr> CreateWhileToMapFnPass() { + return std::make_unique(); +} + +} // namespace mlrt_compiler +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h new file mode 100644 index 00000000000..a45c03871c7 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/while_to_map_fn.h @@ -0,0 +1,31 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_WHILE_TO_MAP_FN_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_WHILE_TO_MAP_FN_H_ + +#include + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project + +namespace tensorflow { +namespace mlrt_compiler { + +std::unique_ptr> CreateWhileToMapFnPass(); + +} // namespace mlrt_compiler +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSFORMS_MLRT_WHILE_TO_MAP_FN_H_ diff --git a/tensorflow/compiler/mlir/tfrt/transforms/passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/passes.cc index 2eda5bfd0e9..bf27bac6ffb 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/passes.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/passes.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_asset_sinking_pass.h" #include "tensorflow/compiler/mlir/tensorflow/utils/bridge_logger.h" #include "tensorflow/compiler/mlir/tfrt/transforms/set_shape_invariant_in_while_ops.h" #include "tensorflow/compiler/mlir/tfrt/transforms/tfrt_jitrt_stub.h" @@ -207,6 +208,11 @@ void CreateTFExecutorToTFInvariantOptimizationPipelineHelper( pm.addPass(CreateSinkInInvariantOpsPass()); } + if (!options.saved_model_dir.empty()) { + pm.addPass( + mlir::tf_saved_model::CreateAssetSinkingPass(options.saved_model_dir)); + } + pm.addPass(CreateLowerTFSavedModelPass( options.hoist_invariant_ops, options.fuse_get_resource_ops_in_hoisting)); } diff --git a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h index 24d245b1714..a4c62f8bf20 100644 --- 
a/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/tfrt_pipeline_options.h @@ -24,6 +24,8 @@ namespace tensorflow { struct TfrtPipelineOptions : public mlir::PassPipelineOptions { + Option saved_model_dir{*this, "saved-model-dir", + llvm::cl::desc(""), llvm::cl::init("")}; Option default_device{ *this, "default-device", llvm::cl::desc("default device assignment"), llvm::cl::init("/job:localhost/replica:0/task:0/device:CPU:0")}; diff --git a/tensorflow/compiler/mlir/tfrt/transforms/update_op_cost_in_tfrt_mlir.cc b/tensorflow/compiler/mlir/tfrt/transforms/update_op_cost_in_tfrt_mlir.cc index d4e39782e2c..b14882d9ca8 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/update_op_cost_in_tfrt_mlir.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/update_op_cost_in_tfrt_mlir.cc @@ -39,8 +39,8 @@ void UpdateOpCostInTfrtMlir(mlir::ModuleOp op, if (!op_key_attr) return; // Set the cost attr with a new value. const int64_t op_key = op_key_attr.getInt(); - op->setAttr(kCostAttrName, builder.getI64IntegerAttr( - cost_recorder.GetCostNanosecond(op_key))); + op->setAttr(kCostAttrName, + builder.getI64IntegerAttr(cost_recorder.GetCost(op_key))); }); } diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc index 1573306ba2d..9e93aa345ad 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc @@ -70,7 +70,7 @@ StatusOr> ExportXlaFunctions(mlir::ModuleOp module) { const auto func_op = symbol_table.lookup(func_name); if (!func_op) { - return tensorflow::errors::Internal( + return absl::InternalError( absl::StrCat("Function ", func_name, " is not found.")); } FunctionDef func_def; @@ -92,6 +92,14 @@ StatusOr> ExportXlaFunctions(mlir::ModuleOp module) { } } }); + + // Remove the function from the module, as it will be handled by XLA. 
+ // It is safe to remove the function, i.e., the function won't be invoked on + // CPU. This is because bridge guarantees that each function has only one + // use. We don't replace the uses of the function, because we iterate from + // the root caller and hence its uses should have been removed. + func_op->erase(); + visited.insert(func_name); } return xla_func_defs; @@ -111,9 +119,9 @@ Status ConvertFunctionToBef( tensorflow::ConvertFunctionToMlir(fbody, flib_def, &context); if (!expected_module.ok()) - return tensorflow::errors::Internal( + return absl::InternalError(absl::StrCat( "Failed to convert function to mlir for function ", function_name.str(), - ". Error: ", expected_module.status().message()); + ". Error: ", expected_module.status().message())); auto module = std::move(expected_module).value(); @@ -152,7 +160,7 @@ Status ConvertTfMlirToRuntimeExecutable( tensorflow::RunTPUBackwardCompatConversion(module, tpu_compile_options); if (mlir::failed(backward_compat_result)) { return diag_handler.Combine( - tensorflow::errors::Internal("Failed to handle legacy TPU Ops")); + absl::InternalError("Failed to handle legacy TPU Ops")); } if (VLOG_IS_ON(1)) { @@ -165,7 +173,7 @@ Status ConvertTfMlirToRuntimeExecutable( auto tpu_partitioned_call_fallback_compat_result = tensorflow::RunTPUPartitionedCallFallbackCompatConversion(module); if (mlir::failed(tpu_partitioned_call_fallback_compat_result)) { - return diag_handler.Combine(tensorflow::errors::Internal( + return diag_handler.Combine(absl::InternalError( "Failed to process TPUPartitionedCallOp for fallback execution")); } } else if (options.device_target == TfrtDeviceInfraTarget::kGpu && @@ -222,7 +230,7 @@ Status ConvertTfMlirToBef(const TfrtCompileOptions& options, if (VLOG_IS_ON(1)) { tensorflow::DumpMlirOpToFile("tf_to_corert_failure", module); } - return diag_handler.Combine(tensorflow::errors::Internal( + return diag_handler.Combine(absl::InternalError( "failed to lower TF Dialect to CoreRT dialect.")); } @@ 
-230,7 +238,7 @@ Status ConvertTfMlirToBef(const TfrtCompileOptions& options, tfrt::ConvertMLIRToBEF(module, /*disable_optional_sections=*/true); if (bef_buffer->empty()) return diag_handler.Combine( - tensorflow::errors::Internal("failed to convert MLIR to BEF.")); + absl::InternalError("failed to convert MLIR to BEF.")); bef_buffer->shrink_to_fit(); return OkStatus(); @@ -241,6 +249,9 @@ Status ConvertTfMlirToBef(const TfrtCompileOptions& options, std::unique_ptr GetTfrtPipelineOptions( const TfrtCompileOptions& options) { auto pipeline_options = std::make_unique(); + + pipeline_options->saved_model_dir = options.saved_model_dir; + if (!options.default_device.empty()) { pipeline_options->default_device = options.default_device; } diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD new file mode 100644 index 00000000000..0d5fac70054 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD @@ -0,0 +1,71 @@ +load( + "//tensorflow:tensorflow.bzl", + "tf_cc_test", +) + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = [ + # copybara:uncomment "//learning/brain/experimental/tfrt:__subpackages__", + # copybara:uncomment "//learning/infra/mira/distributed:__subpackages__", + # copybara:uncomment "//smartass/brain/ops/tfrt_kernels:__subpackages__", + "//tensorflow/compiler/mlir/tfrt/transforms/mlrt:__subpackages__", + "//tensorflow/core/tfrt:__subpackages__", + ], +) + +cc_library( + name = "mlir_to_bytecode", + srcs = ["mlir_to_bytecode.cc"], + hdrs = ["mlir_to_bytecode.h"], + deps = [ + "//tensorflow/core/tfrt/mlrt/bytecode", + "//tensorflow/core/tfrt/mlrt/bytecode:executable", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/status:statusor", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + 
"@llvm-project//mlir:IR", + ], +) + +tf_cc_test( + name = "mlir_to_bytecode_test", + srcs = ["mlir_to_bytecode_test.cc"], + data = glob(["testdata/**"]), + deps = [ + ":mlir_to_bytecode", + "//tensorflow/core/tfrt/mlrt/bytecode:executable", + "//tensorflow/core/tfrt/mlrt/interpreter:attribute_span", + "//tensorflow/tsl/platform:resource_loader", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:Parser", + ], +) + +cc_library( + name = "test_utils", + testonly = 1, + srcs = ["test_utils.cc"], + hdrs = ["test_utils.h"], + deps = [ + "//learning/brain/experimental/tfrt/native_lowering/kernels:sync_context", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:protos_all_cc", + "//tensorflow/core/tfrt/mlrt/attribute", + "//tensorflow/core/tfrt/mlrt/bytecode", + "//tensorflow/core/tfrt/mlrt/bytecode:kernel", + "//tensorflow/core/tfrt/mlrt/interpreter:context", + "//tensorflow/core/tfrt/mlrt/interpreter:interpreter_testutil", + "//tensorflow/core/tfrt/mlrt/interpreter:value", + "//tensorflow/core/tfrt/utils:tensor_util", + "//tensorflow/tsl/platform:errors", + "@com_google_absl//absl/status:statusor", + "@tf_runtime//:hostcontext", + "@tf_runtime//:tensor", + ], +) diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc new file mode 100644 index 00000000000..895f705e6b1 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc @@ -0,0 +1,470 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h" + +#include +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/TypeSwitch.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "tensorflow/core/tfrt/mlrt/bytecode/executable.h" + +namespace mlrt { +namespace { + +// LINT.IfChange(mlrt_attributes) +bool CanBeInlined(mlir::Attribute attr, absl::string_view data) { + // FlatSymbolRefAttr is a special case as we are emitting it as integer. + return attr.isa() && + data.size() <= sizeof(uint32_t); +} +// LINT.ThenChange(../../../../../core/tfrt/mlrt/interpreter/attribute_span.h:mlrt_attributes) + +// Encode integer or float-point numbers as bytes. +template +std::string EncodeIntegerOrFloat(T attr) { + std::string data(sizeof(attr), '\0'); + std::memcpy(data.data(), &attr, sizeof(attr)); + return data; +} + +// Encode a list of I64 integers as bytes using bc::Vector. The bytes +// can be decoded directly using bc::Vector. If `array` is not a list +// I64 integers, a nullopt will be returned. 
+ +template +std::optional EncodeListOfInteger(mlir::ArrayAttr array) { + bc::Buffer buffer; + bc::Allocator allocator(&buffer); + auto ctor = bc::New>(&allocator, array.size()); + + mlir::Type type; + + for (int i = 0; i < array.size(); ++i) { + if (auto integer_attr = array[i].dyn_cast()) { + if (type && integer_attr.getType() != type) return std::nullopt; + type = integer_attr.getType(); + llvm::APInt value = integer_attr.getValue(); + if (value.getBitWidth() != sizeof(T) * 8) return std::nullopt; + ctor.ConstructAt(i, value.getZExtValue()); + } else { + return std::nullopt; + } + } + + return std::string(buffer.data(), buffer.size()); +} + +std::optional EncodeListOfSymbolRef( + const ModuleEmitterContext& module_context, mlir::ArrayAttr array) { + bc::Buffer buffer; + bc::Allocator allocator(&buffer); + auto ctor = bc::New>(&allocator, array.size()); + + for (int i = 0; i < array.size(); ++i) { + if (auto symbol_ref = array[i].dyn_cast()) { + ctor.ConstructAt(i, module_context.GetFunctionId(symbol_ref.getValue())); + } else { + return std::nullopt; + } + } + return std::string(buffer.data(), buffer.size()); +} + +template +std::optional EncodeDenseArray(llvm::ArrayRef array) { + bc::Buffer buffer; + bc::Allocator allocator(&buffer); + auto ctor = bc::New>(&allocator, array.size()); + + if (!array.empty()) { + ctor.Place(reinterpret_cast(array.data()), + array.size() * sizeof(T)); + } + + return std::string(buffer.data(), buffer.size()); +} + +// Encode a list of strings as bytes using bc::Vector. The bytes +// can be decoded directly using bc::Vector. If `array` is not a +// list of strings, a nullopt will be returned. 
+std::optional EncodeListOfString(mlir::ArrayAttr array) { + bc::Buffer buffer; + bc::Allocator allocator(&buffer); + auto ctor = bc::New>(&allocator, array.size()); + + for (int i = 0; i < array.size(); ++i) { + if (auto string_attr = array[i].dyn_cast()) { + ctor.ConstructAt(i, string_attr.getValue().str()); + } else { + return std::nullopt; + } + } + + return std::string(buffer.data(), buffer.size()); +} + +struct FunctionEmitterContext { + explicit FunctionEmitterContext(const ModuleEmitterContext* module_context) + : module_context(*module_context) {} + + const ModuleEmitterContext& module_context; + + struct RegInfo { + int num_uses = 0; + int id = -1; + }; + + int next_reg_id = 0; + llvm::DenseMap register_table; + std::vector free_regs; + + int AssignRegId() { + if (free_regs.empty()) { + return next_reg_id++; + } + int id = free_regs.back(); + free_regs.pop_back(); + return id; + } + + void FreeRegId(int id) { free_regs.push_back(id); } +}; + +// Emit the bytecode for a kernel. It uses the information in an MLIR operation +// and populates the bytecode using bc::Kernel::Constructor. For a kernel's +// bytecode format, please refer to kernel.h. +void EmitKernel(FunctionEmitterContext& function_context, + bc::Kernel::Constructor& constructor, mlir::Operation& op, + std::vector& function_output_regs, + std::vector& function_output_last_uses) { + // Assign reg ids for results first to make sure results does not reuse reg + // ids freed from args in the same operation. 
+ std::vector results; + results.reserve(op.getNumResults()); + for (auto result : op.getResults()) { + auto iter = function_context.register_table.find(result); + CHECK(iter != function_context.register_table.end()); // Crash Ok + CHECK_EQ(iter->second.id, -1); // Crash Ok + iter->second.id = function_context.AssignRegId(); + results.push_back(iter->second.id); + } + constructor.construct_results(results.size()) + .Assign(results.begin(), results.end()); + + std::vector arguments; + std::vector last_uses; + arguments.reserve(op.getNumOperands()); + last_uses.reserve(op.getNumOperands()); + for (auto operand : op.getOperands()) { + auto iter = function_context.register_table.find(operand); + CHECK(iter != function_context.register_table.end()); // Crash Ok + int id = iter->second.id; + CHECK_NE(id, -1); // Crash Ok + last_uses.push_back(0); + if (--iter->second.num_uses == 0) { + function_context.FreeRegId(id); + last_uses.back() = 1; + } + arguments.push_back(id); + } + + constructor.construct_arguments(arguments.size()) + .Assign(arguments.begin(), arguments.end()); + constructor.construct_last_uses(last_uses.size()) + .Assign(last_uses.begin(), last_uses.end()); + + std::vector attributes; + attributes.reserve(op.getAttrs().size()); + for (auto attr : op.getAttrs()) { + int attr_id = + function_context.module_context.GetAttributeId(attr.getValue()); + absl::string_view attr_data = + function_context.module_context.attributes().at(attr_id); + + if (CanBeInlined(attr.getValue(), attr_data)) { + uint32_t data = 0; + std::memcpy(&data, attr_data.data(), attr_data.size()); + attributes.push_back(data); + } else { + attributes.push_back(attr_id); + } + } + constructor.construct_attributes(attributes.size()) + .Assign(attributes.begin(), attributes.end()); + + if (op.hasTrait()) { + constructor.set_code(function_context.module_context.GetKernelId("return")); + + function_output_regs = std::move(arguments); + function_output_last_uses = std::move(last_uses); + + } else 
if (llvm::isa(&op)) { + constructor.set_code(function_context.module_context.GetKernelId("call")); + } else { + llvm::StringRef op_name = op.getName().getStringRef(); + constructor.set_code(function_context.module_context.GetKernelId(op_name)); + } +} + +// Emit the bytecode for a function. It uses information in an MLIR function or +// an MLIR region, and populates the bytecode using bc::Function::Constructor. +// For a function's bytecode format, please refer to function.h. +void EmitFunction(const ModuleEmitterContext& module_context, + bc::Function::Constructor& constructor, llvm::StringRef name, + mlir::Region& region) { + FunctionEmitterContext function_context(&module_context); + + constructor.construct_name(name.str()); + + DCHECK(llvm::hasSingleElement(region)) << "should have a single block"; + + auto& block = region.front(); + + auto& register_table = function_context.register_table; + + std::vector input_regs; + input_regs.reserve(block.getNumArguments()); + for (auto arg : block.getArguments()) { + int id = function_context.AssignRegId(); + input_regs.push_back(id); + register_table[arg] = {static_cast(std::distance(arg.getUses().begin(), + arg.getUses().end())), + id}; + } + constructor.construct_input_regs(input_regs); + + for (auto& op : block) { + for (auto result : op.getResults()) { + register_table[result] = {static_cast( + std::distance(result.getUses().begin(), result.getUses().end()))}; + } + } + + auto kernels_constructor = + constructor.construct_kernels(block.getOperations().size()); + + std::vector output_regs; + std::vector output_last_uses; + for (const auto& iter : llvm::enumerate(block.getOperations())) { + int i = iter.index(); + mlir::Operation& op = iter.value(); + auto kernel_ctor = kernels_constructor.ConstructAt(i); + EmitKernel(function_context, kernel_ctor, op, output_regs, + output_last_uses); + } + + constructor.set_num_regs(function_context.next_reg_id); + constructor.construct_output_regs(output_regs); + 
constructor.construct_output_last_uses(output_last_uses); +} + +// Emit the bytecode for an executable. It converts attributes, kernels, and +// functions in an MLIR module to bytecode using bc::Executable::Constructor. +// For an executable's bytecode format, please refer to executable.h. +absl::Status EmitExecutable(ModuleEmitterContext& module_context, + bc::Executable::Constructor& constructor, + mlir::ModuleOp module) { + module.walk( + [&](mlir::func::FuncOp func) { module_context.AddFunction(func); }); + + auto functions = module_context.functions(); + for (auto func : functions) { + if (!llvm::hasSingleElement(func.getRegion())) { + return absl::InvalidArgumentError("function should have a single block."); + } + auto& block = func.getRegion().front(); + + for (auto& op : block) { + if (llvm::isa(&op)) { + // Canonicalize the MLIR builtin call op's name to "call". + module_context.AddKernelName("call"); + } else if (op.hasTrait()) { + // Canonicalize the return op's name to "return". + if (op.getNumResults() != 0) { + return absl::InvalidArgumentError( + "Block terminator must be a return op."); + } + module_context.AddKernelName("return"); + } else { + module_context.AddKernelName(op.getName().getStringRef().str()); + } + + for (auto attr : op.getAttrs()) { + if (auto status = module_context.AddAttribute(&op, attr.getValue()); + !status.ok()) { + return status; + } + } + + // TODO(chky): Support inline regions. + } + } + + constructor.construct_kernel_names(module_context.kernels().size()) + .Assign(module_context.kernels().begin(), module_context.kernels().end()); + + auto functions_constructor = + constructor.construct_functions(functions.size()); + for (int i = 0; i < functions.size(); ++i) { + auto func = functions[i]; + auto function_ctor = functions_constructor.ConstructAt(i); + EmitFunction(module_context, function_ctor, func.getSymName(), + func.getRegion()); + } + + // Emit attributes after emitting functions as attributes might be large. 
+ // Large attributes may result in large offsets that do not fit into a + // unit32_t integer. Since functions section should fit into 2GB size limit, + // so we emit functions first. + constructor.construct_attributes(module_context.attributes().size()) + .Assign(module_context.attributes().begin(), + module_context.attributes().end()); + + return absl::OkStatus(); +} + +} // namespace + +absl::Status ModuleEmitterContext::AddAttribute(mlir::Operation* op, + mlir::Attribute attr) { + absl::StatusOr attr_data; + if (auto* encoder = attribute_encoder_registry_.Get( + op->getName().getDialectNamespace())) { + attr_data = (*encoder)(*this, attr); + } else { + attr_data = DefaultEncodeAttribute(attr); + } + if (!attr_data.ok()) return std::move(attr_data).status(); + + int id = AddData(std::move(*attr_data), attributes_, attribute_data_id_map_); + attribute_id_map_[attr] = id; + + return absl::OkStatus(); +} + +int ModuleEmitterContext::AddFunction(mlir::func::FuncOp func) { + int id = functions_.size(); + functions_.push_back(func); + DCHECK(!function_name_id_map_.contains(func.getSymName())); + function_name_id_map_[func.getSymName()] = id; + return id; +} + +std::optional EncodeSimpleAttribute( + const ModuleEmitterContext& module_context, mlir::Attribute attr) { + return llvm::TypeSwitch>(attr) + .Case( + [](const auto& str_attr) { return str_attr.str(); }) + .Case( + [](const auto& integer_attr) -> std::optional { + switch (llvm::APInt value = integer_attr.getValue(); + value.getBitWidth()) { + case 1: + return EncodeIntegerOrFloat(value.getZExtValue()); + case 32: + return EncodeIntegerOrFloat(value.getZExtValue()); + case 64: + return EncodeIntegerOrFloat(value.getZExtValue()); + default: + return std::nullopt; + } + }) + .Case( + [](const auto& float_attr) -> std::optional { + llvm::APFloat value = float_attr.getValue(); + if (float_attr.getType().isF32()) { + return EncodeIntegerOrFloat(value.convertToFloat()); + } + return std::nullopt; + }) + 
.Case([&](const auto& array_attr) + -> std::optional { + if (auto encoded_list_i32 = EncodeListOfInteger(array_attr)) { + return std::move(*encoded_list_i32); + } else if (auto encoded_list_i64 = + EncodeListOfInteger(array_attr)) { + return std::move(*encoded_list_i64); + } else if (auto encoded_list_string = EncodeListOfString(array_attr)) { + return std::move(*encoded_list_string); + } else if (auto encoded_list_symbol_ref = + EncodeListOfSymbolRef(module_context, array_attr)) { + return std::move(*encoded_list_symbol_ref); + } else { + return std::nullopt; + } + }) + .Case( + [](const auto& dense_array_i32) -> std::optional { + return EncodeDenseArray(dense_array_i32); + }) + .Case( + [](const auto& dense_array_i64) -> std::optional { + return EncodeDenseArray(dense_array_i64); + }) + .Case([&](const auto& symbol_ref) { + return EncodeIntegerOrFloat( + module_context.GetFunctionId(symbol_ref.getValue())); + }) + .Default([](const auto& attr) { return std::nullopt; }); +} + +// Encode mlir attributes with a limited support such as I64, string and array +// of I64. Returns an error if the attribute is not supported. +absl::StatusOr ModuleEmitterContext::DefaultEncodeAttribute( + mlir::Attribute attr) { + if (auto result = EncodeSimpleAttribute(*this, attr)) { + return std::move(*result); + } + + // TODO(chky): Add a unit test for the error below. This requires we + // propagate the error all the way back to the entry point. 
+ std ::string attr_str; + llvm::raw_string_ostream os(attr_str); + attr.print(os); + + return absl::InvalidArgumentError( + absl::StrCat("Try to encode unsupported attribute: ", attr_str)); +} + +absl::StatusOr EmitExecutable( + const AttributeEncoderRegistry& attribute_encoder_registry, + mlir::ModuleOp module) { + bc::Buffer buffer; + bc::Allocator allocator(&buffer); + + ModuleEmitterContext module_context(&attribute_encoder_registry); + + auto executable_ctor = bc::New(&allocator); + + if (auto status = EmitExecutable(module_context, executable_ctor, module); + !status.ok()) { + return status; + } + + return buffer; +} + +} // namespace mlrt diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h new file mode 100644 index 00000000000..7f5416d230c --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h @@ -0,0 +1,128 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_MLIR_TO_BYTECODE_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_MLIR_TO_BYTECODE_H_ + +#include +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" + +namespace mlrt { + +class ModuleEmitterContext; + +// Defines a custom attribute encoding registry. Users can register custom +// attribute encoding for their dialects in this registry. If no custom encoder +// is registered for a dialect, the default encoding with a limited support, the +// EncodeSimpleAttribute() below, will be used. +class AttributeEncoderRegistry { + public: + using EncoderFn = std::function( + const ModuleEmitterContext&, mlir::Attribute)>; + + void Register(absl::string_view dialect, EncoderFn encoder) { + encoders_[dialect] = std::move(encoder); + } + + // Returns the encoder for the specified dialect. It can be nullptr if it is + // not registered for this dialect. The returned reference will be invalidated + // if Register() is called. 
+ const EncoderFn* Get(absl::string_view dialect) const { + auto iter = encoders_.find(dialect); + if (iter != encoders_.end()) return &iter->second; + return nullptr; + } + + private: + absl::flat_hash_map encoders_; +}; + +class ModuleEmitterContext { + public: + explicit ModuleEmitterContext( + const AttributeEncoderRegistry* attribute_encoder_registry) + : attribute_encoder_registry_(*attribute_encoder_registry) {} + + void AddKernelName(std::string name) { + AddData(std::move(name), kernels_, kernel_id_map_); + } + + int GetKernelId(llvm::StringRef name) const { + return kernel_id_map_.at(name); + } + + absl::Status AddAttribute(mlir::Operation* op, mlir::Attribute attr); + + int GetAttributeId(mlir::Attribute attr) const { + return attribute_id_map_.lookup(attr); + } + + int AddFunction(mlir::func::FuncOp func); + + int GetFunctionId(absl::string_view name) const { + return function_name_id_map_.at(name); + } + + absl::Span kernels() const { return kernels_; } + absl::Span attributes() const { return attributes_; } + absl::Span functions() const { return functions_; } + + private: + int AddData(std::string data, std::vector& data_vector, + absl::flat_hash_map& data_map) { + auto iter = data_map.find(data); + if (iter != data_map.end()) return iter->second; + + int id = data_vector.size(); + data_map[data] = id; + data_vector.push_back(std::move(data)); + return id; + } + + absl::StatusOr DefaultEncodeAttribute(mlir::Attribute attr); + + const AttributeEncoderRegistry& attribute_encoder_registry_; + + std::vector kernels_; + absl::flat_hash_map kernel_id_map_; + + std::vector attributes_; + llvm::DenseMap attribute_id_map_; + absl::flat_hash_map attribute_data_id_map_; + + std::vector functions_; + absl::flat_hash_map function_name_id_map_; +}; + +// Encodes a few simple attributes. Users can use this function in their custom +// attribute encoder. 
+std::optional EncodeSimpleAttribute( + const ModuleEmitterContext& module_context, mlir::Attribute attr); + +absl::StatusOr EmitExecutable( + const AttributeEncoderRegistry& attribute_encoder_registry, + mlir::ModuleOp module); + +} // namespace mlrt + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_MLIR_TO_BYTECODE_H_ diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc new file mode 100644 index 00000000000..d94e8df3b2e --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode_test.cc @@ -0,0 +1,365 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.h" + +#include +#include +#include + +#include +#include +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "tensorflow/core/tfrt/mlrt/bytecode/executable.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/attribute_span.h" +#include "tensorflow/tsl/platform/resource_loader.h" + +namespace mlrt { +namespace { + +TEST(MlirToByteCodeTest, Basic) { + constexpr char kBasicMlir[] = + "tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/basic.mlir"; + + mlir::DialectRegistry registry; + registry.insert(); + mlir::MLIRContext mlir_context(registry); + mlir_context.allowUnregisteredDialects(); + auto mlir_module = mlir::parseSourceFile( + tsl::GetDataDependencyFilepath(kBasicMlir), &mlir_context); + + AttributeEncoderRegistry attribute_encoder_registry; + bc::Buffer buffer = + EmitExecutable(attribute_encoder_registry, mlir_module.get()).value(); + + bc::Executable executable(buffer.data()); + + auto kernel_names = executable.kernel_names(); + EXPECT_THAT(kernel_names, ::testing::ElementsAreArray({"test_mlbc.add.i32", + "test_mlbc.sub.i32", + "call", "return"})); + + auto functions = executable.functions(); + ASSERT_GE(functions.size(), 1); + + auto function = functions[0]; + EXPECT_EQ(function.name().str(), "add_i32_10"); + EXPECT_EQ(function.num_regs(), 5); + EXPECT_THAT(function.input_regs(), ::testing::ElementsAreArray({0})); + EXPECT_THAT(function.output_regs(), ::testing::ElementsAreArray({0, 2, 2})); + EXPECT_THAT(function.output_last_uses(), + ::testing::ElementsAreArray({true, false, true})); + + auto kernels = function.kernels(); + ASSERT_EQ(kernels.size(), 11); + + EXPECT_EQ(kernels[0].code(), 0); + EXPECT_THAT(kernels[0].arguments(), ::testing::ElementsAreArray({0, 0})); + EXPECT_THAT(kernels[0].results(), 
::testing::ElementsAreArray({1})); + EXPECT_THAT(kernels[0].last_uses(), ::testing::ElementsAreArray({0, 0})); + + for (int i = 1; i < 9; i++) { + EXPECT_EQ(kernels[i].code(), i % 2); + EXPECT_THAT(kernels[i].arguments(), + ::testing::ElementsAreArray({(i - 1) % 2 + 1, 0})); + EXPECT_THAT(kernels[i].results(), ::testing::ElementsAreArray({i % 2 + 1})); + EXPECT_THAT(kernels[i].last_uses(), ::testing::ElementsAreArray({1, 0})); + } + + EXPECT_EQ(kernels[9].code(), 2); + EXPECT_THAT(kernels[9].arguments(), ::testing::ElementsAreArray({1})); + EXPECT_THAT(kernels[9].last_uses(), ::testing::ElementsAreArray({true})); + EXPECT_THAT(kernels[9].results(), ::testing::ElementsAreArray({2, 3, 4})); + + EXPECT_EQ(kernels[10].code(), 3); + EXPECT_THAT(kernels[10].arguments(), ::testing::ElementsAreArray({0, 2, 2})); + EXPECT_THAT(kernels[10].last_uses(), + ::testing::ElementsAreArray({true, false, true})); + EXPECT_TRUE(kernels[10].results().empty()); +} + +template +absl::StatusOr DecodeAttribute(absl::string_view data) { + if (data.size() < sizeof(T)) + return absl::InvalidArgumentError("Invalid data size for attribute."); + + T value; + std::memcpy(&value, data.data(), sizeof(T)); + return value; +} + +TEST(MlirToByteCodeTest, BasicAttributes) { + constexpr char kBasicAttributesMlir[] = + "tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/" + "basic_attributes.mlir"; + + mlir::DialectRegistry registry; + registry.insert(); + mlir::MLIRContext mlir_context(registry); + mlir_context.allowUnregisteredDialects(); + auto mlir_module = mlir::parseSourceFile( + tsl::GetDataDependencyFilepath(kBasicAttributesMlir), &mlir_context); + + AttributeEncoderRegistry attribute_encoder_registry; + bc::Buffer buffer = + EmitExecutable(attribute_encoder_registry, mlir_module.get()).value(); + + bc::Executable executable(buffer.data()); + + auto attributes = executable.attributes(); + + ASSERT_EQ(attributes.size(), 14); + + auto attr_iter = attributes.begin(); + + EXPECT_EQ(*attr_iter, 
"test string"); + ++attr_iter; + + EXPECT_EQ(*attr_iter, "ts"); + ++attr_iter; + + EXPECT_THAT(DecodeAttribute(*attr_iter), + ::testing::status::IsOkAndHolds(100)); + ++attr_iter; + + EXPECT_THAT(DecodeAttribute(*attr_iter), + ::testing::status::IsOkAndHolds(200)); + ++attr_iter; + + EXPECT_THAT(DecodeAttribute(*attr_iter), + ::testing::status::IsOkAndHolds(::testing::FloatEq(3.0))); + ++attr_iter; + + EXPECT_THAT(DecodeAttribute(*attr_iter), + ::testing::status::IsOkAndHolds(0)); + ++attr_iter; + + bc::Vector list_of_i64((*attr_iter).data()); + EXPECT_THAT(list_of_i64, ::testing::ElementsAreArray({0, 1, 2, 3, 4})); + ++attr_iter; + + bc::Vector list_of_i32((*attr_iter).data()); + EXPECT_THAT(list_of_i32, ::testing::ElementsAreArray({0, 1, 2, 3})); + ++attr_iter; + + bc::Vector list_of_str((*attr_iter).data()); + EXPECT_THAT(list_of_str, + ::testing::ElementsAreArray({"string 0", "string 1"})); + ++attr_iter; + + EXPECT_THAT(DecodeAttribute(*attr_iter), + ::testing::status::IsOkAndHolds(1)); + EXPECT_EQ(executable.functions()[1].name().Get(), "callee"); + ++attr_iter; + + bc::Vector list_of_symbol_ref((*attr_iter).data()); + EXPECT_EQ(executable.functions()[2].name().Get(), "callee0"); + EXPECT_EQ(executable.functions()[3].name().Get(), "callee1"); + EXPECT_THAT(list_of_symbol_ref, ::testing::ElementsAreArray({2, 3})); + ++attr_iter; + + bc::Vector dense_array_of_i32((*attr_iter).data()); + EXPECT_THAT(dense_array_of_i32, ::testing::ElementsAreArray({0, 1, 2})); + ++attr_iter; + + bc::Vector dense_array_of_i64((*attr_iter).data()); + EXPECT_THAT(dense_array_of_i64, ::testing::ElementsAreArray({0, 1, 2})); + ++attr_iter; + + bc::Vector empty_dense_array((*attr_iter).data()); + EXPECT_TRUE(empty_dense_array.empty()); + + auto kernels = executable.functions()[0].kernels(); + ASSERT_EQ(kernels.size(), 15); + auto kernel_iter = kernels.begin(); + + auto attribute_span = [&](auto kernel_iter) { + return mlrt::AttributeSpan((*kernel_iter).attributes(), attributes); + }; + 
+ EXPECT_EQ(attribute_span(kernel_iter).GetAs(0).Get(), + "test string"); + ++kernel_iter; + + EXPECT_EQ(attribute_span(kernel_iter).GetAs(0).Get(), "ts"); + ++kernel_iter; + + EXPECT_EQ(attribute_span(kernel_iter).GetAs(0), 100); + ++kernel_iter; + + EXPECT_EQ(attribute_span(kernel_iter).GetAs(0), 200); + ++kernel_iter; + + EXPECT_THAT(attribute_span(kernel_iter).GetAs(0), + ::testing::FloatEq(3.0)); + ++kernel_iter; + + EXPECT_EQ(attribute_span(kernel_iter).GetAs(0), false); + ++kernel_iter; + + EXPECT_THAT(attribute_span(kernel_iter).GetAs>(0), + ::testing::ElementsAreArray({0, 1, 2, 3, 4})); + ++kernel_iter; + + EXPECT_THAT(attribute_span(kernel_iter).GetAs>(0), + ::testing::ElementsAreArray({0, 1, 2, 3})); + ++kernel_iter; + + EXPECT_THAT(attribute_span(kernel_iter).GetAs>(0), + ::testing::ElementsAreArray({"string 0", "string 1"})); + ++kernel_iter; + + EXPECT_EQ(attribute_span(kernel_iter).GetAs(0), 1); + ++kernel_iter; + + EXPECT_THAT(attribute_span(kernel_iter).GetAs>(0), + ::testing::ElementsAreArray({2, 3})); + ++kernel_iter; + + EXPECT_THAT(attribute_span(kernel_iter).GetAs>(0), + ::testing::ElementsAreArray({0, 1, 2})); + ++kernel_iter; + + EXPECT_THAT(attribute_span(kernel_iter).GetAs>(0), + ::testing::ElementsAreArray({0, 1, 2})); + ++kernel_iter; + + EXPECT_THAT(attribute_span(kernel_iter).GetAs>(0), + ::testing::IsEmpty()); +} + +TEST(MlirToByteCodeTest, UnsupportedAttributes) { + constexpr char kUnsupportedAttributesMlir[] = + "tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/" + "unsupported_attributes.mlir"; + + mlir::DialectRegistry registry; + registry.insert(); + mlir::MLIRContext mlir_context(registry); + mlir_context.allowUnregisteredDialects(); + auto mlir_module = mlir::parseSourceFile( + tsl::GetDataDependencyFilepath(kUnsupportedAttributesMlir), + &mlir_context); + + AttributeEncoderRegistry attribute_encoder_registry; + EXPECT_THAT(EmitExecutable(attribute_encoder_registry, mlir_module.get()), + 
::testing::status::CanonicalStatusIs( + absl::StatusCode::kInvalidArgument, + "Try to encode unsupported attribute: unit")); +} + +class CustomDense { + public: + struct StorageType { + using Self = StorageType; + DEFINE_BYTECODE_FIELD(bc::Vector, shape); + DEFINE_BYTECODE_FIELD(bc::Vector, data); + }; + + class Constructor { + public: + Constructor(bc::Allocator* allocator, bc::BcAddr_t address) + : allocator_(allocator), address_(address) {} + + template + auto construct_shape(Args&&... args) { + return StorageType::construct_shape(allocator_, address_, + std::forward(args)...); + } + template + auto construct_data(Args&&... args) { + return StorageType::construct_data(allocator_, address_, + std::forward(args)...); + } + + bc::BcAddr_t address() const { return address_; } + + private: + bc::Allocator* allocator_; + bc::BcAddr_t address_; + }; + using NonTrivialConstructorType = Constructor; + + explicit CustomDense(const char* p) : p_(p) {} + + bc::Vector shape() const { return StorageType::read_shape(p_); } + bc::Vector data() const { return StorageType::read_data(p_); } + + private: + const char* p_ = nullptr; +}; + +absl::StatusOr EncodeCustomDense(const ModuleEmitterContext&, + mlir::Attribute attr) { + auto dense_int_attr = attr.dyn_cast(); + if (!dense_int_attr) + return absl::InvalidArgumentError( + "The element of the custom dense attribute must be an integer."); + + if (dense_int_attr.getElementType().cast().getWidth() != + 32) { + return absl::InvalidArgumentError( + "The element of the custom dense attribute must be an i32 integer."); + } + + bc::Buffer buffer; + bc::Allocator allocator(&buffer); + auto custom_dense_ctor = bc::New(&allocator); + + auto shaped_type = dense_int_attr.getType(); + std::vector shape(shaped_type.getShape().begin(), + shaped_type.getShape().end()); + custom_dense_ctor.construct_shape(shape); + + custom_dense_ctor.construct_data(shaped_type.getNumElements()) + .Place(dense_int_attr.getRawData().data(), + 
dense_int_attr.getRawData().size()); + + return std::string(buffer.data(), buffer.size()); +} + +TEST(MlirToByteCodeTest, CustomDense) { + constexpr char kCustomAttributesMlir[] = + "tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/" + "custom_attributes.mlir"; + + mlir::DialectRegistry registry; + registry.insert(); + mlir::MLIRContext mlir_context(registry); + mlir_context.allowUnregisteredDialects(); + auto mlir_module = mlir::parseSourceFile( + tsl::GetDataDependencyFilepath(kCustomAttributesMlir), &mlir_context); + + AttributeEncoderRegistry attribute_encoder_registry; + attribute_encoder_registry.Register("test_custom", &EncodeCustomDense); + bc::Buffer buffer = + EmitExecutable(attribute_encoder_registry, mlir_module.get()).value(); + + bc::Executable executable(buffer.data()); + + auto attributes = executable.attributes(); + + ASSERT_EQ(attributes.size(), 10); + for (int i = 0; i < 10; ++i) { + bc::String attr_data = attributes[i]; + + CustomDense custom_dense(attr_data.data()); + EXPECT_THAT(custom_dense.shape(), ::testing::ElementsAreArray({1})); + EXPECT_THAT(custom_dense.data(), ::testing::ElementsAreArray({i})); + } +} + +} // namespace +} // namespace mlrt diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.cc new file mode 100644 index 00000000000..b5a3cb9550c --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.cc @@ -0,0 +1,174 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.h" + +#include +#include +#include +#include +#include +#include +#include + +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/kernel.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/interpreter_testutil.h" + +namespace mlrt { +namespace testing { + +absl::StatusOr EncodeAttribute(const tensorflow::AttrValue& attr) { + if (attr.has_b()) { + std::string result; + result.resize(sizeof(uint8_t)); + uint8_t v = attr.b(); + std::memcpy(result.data(), &v, sizeof(v)); + return result; + } + + if (attr.has_i()) { + std::string result; + result.resize(sizeof(int64_t)); + int64_t v = attr.i(); + std::memcpy(result.data(), &v, sizeof(v)); + return result; + } + + if (attr.has_f()) { + std::string result; + result.resize(sizeof(float)); + float v = attr.f(); + std::memcpy(result.data(), &v, sizeof(v)); + return result; + } + + if (attr.has_s()) { + return attr.s(); + } + + if (attr.has_list()) { + if (attr.list().s_size() > 0) { + mlrt::bc::Buffer buffer; + mlrt::bc::Allocator allocator(&buffer); + auto ctor = mlrt::bc::New>( + &allocator, attr.list().s_size()); + + for (int i = 0; i < attr.list().s_size(); ++i) { + ctor.ConstructAt(i, attr.list().s(i)); + } + + return std::string(buffer.data(), buffer.size()); + } + } + + if (attr.has_tensor()) { + mlrt::bc::Buffer buffer; + mlrt::bc::Allocator allocator(&buffer); + + tensorflow::Tensor tensor; + if (!tensor.FromProto(attr.tensor())) { + return absl::InvalidArgumentError("Invalid tensor proto."); + } + + auto tensor_attr_ctor = mlrt::bc::New( + &allocator, tensor.dtype()); + + auto shape = tensor.shape().dim_sizes(); + + 
tensor_attr_ctor.construct_shape(shape.size()) + .Assign(shape.begin(), shape.end()); + + auto tensor_data = tensor.tensor_data(); + tensor_attr_ctor.construct_data(tensor_data.size()) + .Place(tensor_data.data(), tensor_data.size()); + + return std::string(buffer.data(), buffer.size()); + } + + // TODO(chky,rohitju): Add more attribute support. + + return absl::InvalidArgumentError("Unsupported attribute."); +} + +namespace { + +bool CanBeInlined(const tensorflow::AttrValue& attr) { + return attr.has_b() || attr.has_f(); +} + +} // namespace + +absl::Status EncodeAttributes(AttributeTable& attributes, + const tensorflow::AttrValueMap& attr_map) { + std::vector> attrs( + attr_map.begin(), attr_map.end()); + std::sort(attrs.begin(), attrs.end(), + [](const auto& x, const auto& y) { return x.first < y.first; }); + + for (int i = 0; i < attrs.size(); ++i) { + const tensorflow::AttrValue& attr = attrs[i].second; + TF_ASSIGN_OR_RETURN(auto attr_str, EncodeAttribute(attr)); + if (CanBeInlined(attr)) { + attributes.AddInline(absl::StrCat(i), attr_str); + } else { + attributes.Add(absl::StrCat(i), attr_str); + } + } + + return absl::OkStatus(); +} + +absl::StatusOr>> +CreateKernelAndAttrs(int num_inputs, int num_outputs, + mlrt::ExecutionContext& exec_ctx, mlrt::bc::Buffer* buffer, + const tensorflow::AttrValueMap& attrs) { + mlrt::bc::Allocator allocator(buffer); + auto attributes_ctor = mlrt::bc::New>( + &allocator, attrs.size()); + AttributeTable attribute_table(attributes_ctor); + TF_RETURN_IF_ERROR(EncodeAttributes(attribute_table, attrs)); + + auto kernel_ctor = mlrt::bc::New(&allocator); + kernel_ctor.set_code(0); + + std::vector input_indices(num_inputs); + std::iota(input_indices.begin(), input_indices.end(), 0); + kernel_ctor.construct_arguments(input_indices.size()) + .Assign(input_indices.begin(), input_indices.end()); + + std::vector output_indices(num_outputs); + std::iota(output_indices.begin(), output_indices.end(), num_inputs); + 
kernel_ctor.construct_results(output_indices.size()) + .Assign(output_indices.begin(), output_indices.end()); + + std::vector attr_indices; + attr_indices.reserve(attrs.size()); + for (int i = 0; i < attrs.size(); ++i) { + attr_indices.push_back(attribute_table.GetHandle(absl::StrCat(i))); + } + + kernel_ctor.construct_attributes(attr_indices.size()) + .Assign(attr_indices.begin(), attr_indices.end()); + + mlrt::bc::Vector attributes( + buffer->Get(attributes_ctor.address())); + mlrt::bc::Kernel kernel(buffer->Get(kernel_ctor.address())); + + return std::make_pair(kernel, attributes); +} + +} // namespace testing +} // namespace mlrt diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.h b/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.h new file mode 100644 index 00000000000..fd2d491923f --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/test_utils.h @@ -0,0 +1,115 @@ +/* Copyright 2021 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_TEST_UTILS_H_ +#define TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_TEST_UTILS_H_ + +#include +#include +#include +#include +#include + +#include "learning/brain/experimental/tfrt/native_lowering/kernels/sync_context.h" +#include "absl/status/statusor.h" +#include "tensorflow/core/framework/attr_value.pb.h" +#include "tensorflow/core/framework/node_def_util.h" +#include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/tfrt/mlrt/attribute/attribute.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/bytecode.h" +#include "tensorflow/core/tfrt/mlrt/bytecode/kernel.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/context.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/interpreter_testutil.h" +#include "tensorflow/core/tfrt/mlrt/interpreter/value.h" +#include "tensorflow/core/tfrt/utils/tensor_util.h" +#include "tensorflow/tsl/platform/errors.h" +#include "tfrt/host_context/concurrent_work_queue.h" // from @tf_runtime +#include "tfrt/host_context/execution_context.h" // from @tf_runtime +#include "tfrt/host_context/host_allocator.h" // from @tf_runtime +#include "tfrt/host_context/host_context.h" // from @tf_runtime +#include "tfrt/tensor/dense_tensor_utils.h" // from @tf_runtime + +namespace mlrt { +namespace testing { + +absl::StatusOr EncodeAttribute(const tensorflow::AttrValue& attr); + +absl::Status EncodeAttributes(AttributeTable& attributes, + const tensorflow::AttrValueMap& attr_map); + +absl::StatusOr>> +CreateKernelAndAttrs(int num_inputs, int num_outputs, + mlrt::ExecutionContext& exec_ctx, mlrt::bc::Buffer* buffer, + const tensorflow::AttrValueMap& attrs = {}); + +template +absl::Status TestMlrtKernel( + absl::string_view kernel_name, absl::Span regs, + tfrt::HostContext* host, int num_inputs, int num_outputs, + absl::Span expected_outputs, + mlrt::KernelRegistry* registry, bool approx_equal = false, + 
const tensorflow::AttrValueMap& attrs = {}) { + mlrt::ExecutionContext execution_context(nullptr); + + mlrt::bc::Buffer buffer; + TF_ASSIGN_OR_RETURN(auto kernel_and_attrs, + CreateKernelAndAttrs(num_inputs, num_outputs, + execution_context, &buffer, attrs)); + + tfrt::ExecutionContext tfrt_execution_context( + *tfrt::RequestContextBuilder(host, nullptr).build()); + tensorflow::tfrt_stub::SyncResourceState sync_resource_state; + auto sync_context = + std::make_unique(*host, &sync_resource_state); + execution_context.AddUserContext(std::move(sync_context)); + + auto kernel_fn = registry->Get(kernel_name); + mlrt::KernelFrame::State state(regs, kernel_and_attrs.second, + &execution_context); + mlrt::KernelFrame frame(&state); + frame.set_kernel(kernel_and_attrs.first); + + kernel_fn(frame); + + TF_RETURN_IF_ERROR(execution_context.status()); + + for (int i = 0, j = num_inputs; i < expected_outputs.size(); ++i, ++j) { + const auto& expected_output = expected_outputs[i]; + auto expected_dht = tfrt::ConvertTfTensorToDHT(expected_output); + if (!expected_dht) { + return absl::InternalError(tfrt::StrCat(expected_dht.takeError())); + } + + if (!approx_equal) { + if (!tfrt::TensorEqual(regs[j].Get(), + *expected_dht)) { + return absl::InternalError( + absl::StrCat("wrong result for ", kernel_name)); + } + } else { + if (!tfrt::TensorApproxEqual(regs[j].Get(), + *expected_dht)) { + return absl::InternalError( + absl::StrCat("wrong result for ", kernel_name)); + } + } + } + + return absl::OkStatus(); +} + +} // namespace testing +} // namespace mlrt + +#endif // TENSORFLOW_COMPILER_MLIR_TFRT_TRANSLATE_MLRT_TEST_UTILS_H_ diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/basic.mlir b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/basic.mlir new file mode 100644 index 00000000000..a5d5f98332b --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/basic.mlir @@ -0,0 +1,13 @@ +func.func @add_i32_10(%c0: i32) -> (i32, i32, i32) { + %c1 = 
"test_mlbc.add.i32"(%c0, %c0) : (i32, i32) -> i32 + %c2 = "test_mlbc.sub.i32"(%c1, %c0) : (i32, i32) -> i32 + %c3 = "test_mlbc.add.i32"(%c2, %c0) : (i32, i32) -> i32 + %c4 = "test_mlbc.sub.i32"(%c3, %c0) : (i32, i32) -> i32 + %c5 = "test_mlbc.add.i32"(%c4, %c0) : (i32, i32) -> i32 + %c6 = "test_mlbc.sub.i32"(%c5, %c0) : (i32, i32) -> i32 + %c7 = "test_mlbc.add.i32"(%c6, %c0) : (i32, i32) -> i32 + %c8 = "test_mlbc.sub.i32"(%c7, %c0) : (i32, i32) -> i32 + %c9 = "test_mlbc.add.i32"(%c8, %c0) : (i32, i32) -> i32 + %c10, %c11, %c12 = call @add_i32_10(%c9) : (i32) -> (i32, i32, i32) + func.return %c0, %c10, %c10 : i32, i32, i32 +} diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/basic_attributes.mlir b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/basic_attributes.mlir new file mode 100644 index 00000000000..db72598bb4a --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/basic_attributes.mlir @@ -0,0 +1,29 @@ +func.func @simple_attributes() { + "test_custom.attribute"() {value = "test string"} : () -> () + "test_custom.attribute"() {value = "ts"} : () -> () + "test_custom.attribute"() {value = 100 : i32} : () -> () + "test_custom.attribute"() {value = 200 : i64} : () -> () + "test_custom.attribute"() {value = 3.0 : f32} : () -> () + "test_custom.attribute"() {value = false} : () -> () + "test_custom.attribute"() {value = [0, 1, 2, 3, 4]} : () -> () + "test_custom.attribute"() {value = [0 : i32, 1 : i32, 2 : i32, 3 : i32]} : () -> () + "test_custom.attribute"() {value = ["string 0", "string 1"]} : () -> () + "test_custom.attribute"() {value = @callee} : () -> () + "test_custom.attribute"() {value = [@callee0, @callee1]} : () -> () + "test_custom.attribute"() {value = array} : () -> () + "test_custom.attribute"() {value = array} : () -> () + "test_custom.attribute"() {value = array} : () -> () + func.return +} + +func.func @callee() { + return +} + +func.func @callee0() { + return +} + +func.func @callee1() { + return +} diff 
--git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/custom_attributes.mlir b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/custom_attributes.mlir new file mode 100644 index 00000000000..54f325092f3 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/custom_attributes.mlir @@ -0,0 +1,13 @@ +func.func @add_const_custom_dense_i32_10(%c0: i32) -> i32 { + %c1 = "test_custom.add.const.i32"(%c0) {value = dense<[0]> : tensor<1xi32>} : (i32) -> i32 + %c2 = "test_custom.add.const.i32"(%c1) {value = dense<[1]> : tensor<1xi32>} : (i32) -> i32 + %c3 = "test_custom.add.const.i32"(%c2) {value = dense<[2]> : tensor<1xi32>} : (i32) -> i32 + %c4 = "test_custom.add.const.i32"(%c3) {value = dense<[3]> : tensor<1xi32>} : (i32) -> i32 + %c5 = "test_custom.add.const.i32"(%c4) {value = dense<[4]> : tensor<1xi32>} : (i32) -> i32 + %c6 = "test_custom.add.const.i32"(%c5) {value = dense<[5]> : tensor<1xi32>} : (i32) -> i32 + %c7 = "test_custom.add.const.i32"(%c6) {value = dense<[6]> : tensor<1xi32>} : (i32) -> i32 + %c8 = "test_custom.add.const.i32"(%c7) {value = dense<[7]> : tensor<1xi32>} : (i32) -> i32 + %c9 = "test_custom.add.const.i32"(%c8) {value = dense<[8]> : tensor<1xi32>} : (i32) -> i32 + %c10 = "test_custom.add.const.i32"(%c9) {value = dense<[9]> : tensor<1xi32>} : (i32) -> i32 + func.return %c10 : i32 +} diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/unsupported_attributes.mlir b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/unsupported_attributes.mlir new file mode 100644 index 00000000000..4c060815c95 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/testdata/unsupported_attributes.mlir @@ -0,0 +1,5 @@ +func.func @unsupported_attributes() { + "test_custom.attribute"() {unit} : () -> () + func.return +} + diff --git a/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h b/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h index e451cf737f3..7b731307531 100644 --- 
a/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h +++ b/tensorflow/compiler/mlir/tfrt/translate/tfrt_compile_options.h @@ -38,6 +38,7 @@ enum class TfrtDeviceInfraTarget { std::ostream& operator<<(std::ostream& os, TfrtDeviceInfraTarget device_target); struct TfrtCompileOptions { + std::string saved_model_dir; // TODO(tfrt-devs): Ideally, compiler should make the decision where // to place the variable. std::string variable_device = "/job:localhost/replica:0/task:0/device:CPU:0"; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index 145cce5ac6b..29d96e79c47 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -16,7 +16,6 @@ load( ) load( "//tensorflow/core/platform:build_config.bzl", - "if_llvm_aarch64_available", "if_llvm_system_z_available", "tf_proto_library", ) @@ -102,6 +101,7 @@ tf_cc_binary( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:lib", "@com_google_absl//absl/strings", + "@llvm-project//llvm:AArch64CodeGen", # fixdeps: keep "@llvm-project//llvm:ARMCodeGen", # fixdeps: keep "@llvm-project//llvm:Analysis", "@llvm-project//llvm:CodeGen", @@ -119,8 +119,6 @@ tf_cc_binary( "@llvm-project//mlir:ToLLVMIRTranslation", ] + if_llvm_system_z_available([ "@llvm-project//llvm:SystemZCodeGen", # fixdeps: keep - ]) + if_llvm_aarch64_available([ - "@llvm-project//llvm:AArch64CodeGen", # fixdeps: keep ]), ) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD index 29ae6752cd2..e4892ba1d2e 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/BUILD @@ -83,6 +83,7 @@ cc_library( ":tf_framework_ops_inc_gen", ":tf_status_inc_gen", "//tensorflow/core/protobuf:error_codes_proto_impl_cc", + "//tensorflow/tsl/protobuf:error_codes_proto_impl_cc", "@com_google_absl//absl/status", 
"@llvm-project//mlir:AllocationOpInterface", "@llvm-project//mlir:BufferizationDialect", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc index 48e288eb48d..e6f87f387ed 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.cc @@ -28,6 +28,7 @@ limitations under the License. // Generated dialect definitions. #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_dialect.cc.inc" +#include "tensorflow/tsl/protobuf/error_codes.pb.h" namespace mlir { namespace kernel_gen { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/tests/BUILD index 4d4abfd9d90..3fa42457e1d 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "//tensorflow/compiler/mlir:run_lit.sh", test_file_exts = ["mlir"], diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir index e2a5601fc53..6d90a339e5c 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir @@ -13,8 +13,9 @@ func.func @alloc(%ctx: !tf_framework.op_kernel_context, func.return %buf : memref } // Compute number of elements. 
-// CHECK: [[SIZE_1:%.*]] = llvm.mlir.constant(10 : index) : i64 -// CHECK: [[NUM_ELEM_0:%.*]] = llvm.mul [[SIZE_0]], [[SIZE_1]] : i64 +// CHECK: [[SIZE_1A:%.*]] = llvm.mlir.constant(10 : index) : i64 +// CHECK: [[SIZE_1B:%.*]] = llvm.mlir.constant(10 : index) : i64 +// CHECK: [[NUM_ELEM_0:%.*]] = llvm.mul [[SIZE_0]], [[SIZE_1B]] : i64 // CHECK: [[NUM_ELEMS:%.*]] = llvm.mul [[NUM_ELEM_0]], [[SIZE_2]] : i64 // Compute the size of an individual element. @@ -48,9 +49,9 @@ func.func @alloc(%ctx: !tf_framework.op_kernel_context, // CHECK: [[DESC_4:%.*]] = llvm.insertvalue [[SIZE_2]], [[DESC_3]][3, 2] // CHECK: [[DESC_5:%.*]] = llvm.insertvalue [[STRIDE_2]], [[DESC_4]][4, 2] // CHECK: [[STRIDE_1:%.*]] = llvm.mul [[STRIDE_2]], [[SIZE_2]] : i64 -// CHECK: [[DESC_6:%.*]] = llvm.insertvalue [[SIZE_1]], [[DESC_5]][3, 1] +// CHECK: [[DESC_6:%.*]] = llvm.insertvalue [[SIZE_1A]], [[DESC_5]][3, 1] // CHECK: [[DESC_7:%.*]] = llvm.insertvalue [[STRIDE_1]], [[DESC_6]][4, 1] -// CHECK: [[STRIDE_0:%.*]] = llvm.mul [[STRIDE_1]], [[SIZE_1]] : i64 +// CHECK: [[STRIDE_0:%.*]] = llvm.mul [[STRIDE_1]], [[SIZE_1A]] : i64 // CHECK: [[DESC_8:%.*]] = llvm.insertvalue [[SIZE_0]], [[DESC_7]][3, 0] // CHECK: [[DESC_9:%.*]] = llvm.insertvalue [[STRIDE_0]], [[DESC_8]][4, 0] // CHECK: llvm.return [[DESC_9]] : [[DESC_TY]] @@ -212,7 +213,7 @@ func.func @jit_execute(%ctx: !tf_framework.op_kernel_context, // CHECK: %[[ARG:.*]] = llvm.insertvalue %[[ARG_DESCR]], %[[T1]][1] // CHECK: %[[C1:.*]] = llvm.mlir.constant(1 : i64) // CHECK: %[[RESULT_PTR:.*]] = llvm.alloca %[[C1]] x !llvm.struct<(i64, ptr)> - + // Copy argument(s) to stack-allocated buffer. 
// CHECK: %[[NUM_ARGS:.*]] = llvm.mlir.constant(1 : i64) // CHECK: %[[ARGS_PTR:.*]] = llvm.alloca %[[NUM_ARGS]] x !llvm.struct<(i64, ptr)> diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD index 38f0b297272..d0caa87983d 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_to_kernel/BUILD @@ -6,6 +6,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], default_tags = [ # We need access to the CUDA SDK. diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc index fe9d26723b9..73cc324f405 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc @@ -176,8 +176,8 @@ llvm::Expected> Compile( } // Create the kernel. - mlir::OwningOpRef module; mlir::MLIRContext context; + mlir::OwningOpRef module; if (item.result_module().empty()) { // Otherwise, compile the module now. diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc index 8d5d583a2dc..e6ab814216a 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_to_kernel.cc @@ -17,6 +17,7 @@ // This file implements the entry point to compile a tf op to a kernel. 
// //===----------------------------------------------------------------------===// +#include #include #include #include @@ -50,10 +51,15 @@ namespace { static llvm::codegen::RegisterCodeGenFlags CGF; -std::unique_ptr GetTargetMachine(llvm::Module* module) { +std::unique_ptr GetTargetMachine( + llvm::StringRef host_triple, llvm::Module* module) { llvm::Triple triple(module->getTargetTriple()); if (triple.getTriple().empty()) { - triple = llvm::Triple(llvm::sys::getDefaultTargetTriple()); + if (!host_triple.empty()) { + triple = llvm::Triple(host_triple); + } else { + triple = llvm::Triple(llvm::sys::getDefaultTargetTriple()); + } module->setTargetTriple(triple.getTriple()); } @@ -71,14 +77,15 @@ std::unique_ptr GetTargetMachine(llvm::Module* module) { } // Compiles the given MLIR module via LLVM into an executable binary format. -StatusOr EmitToBinary(mlir::ModuleOp module) { +StatusOr EmitToBinary(llvm::StringRef host_triple, + mlir::ModuleOp module) { // Translate the module. llvm::LLVMContext llvm_context; mlir::registerLLVMDialectTranslation(*module->getContext()); std::unique_ptr llvm_module = mlir::translateModuleToLLVMIR(module, llvm_context); - auto target_machine = GetTargetMachine(llvm_module.get()); + auto target_machine = GetTargetMachine(host_triple, llvm_module.get()); llvm_module->setDataLayout(target_machine->createDataLayout()); // Run LLVM's mid-level optimizer to clean up the IR. @@ -106,6 +113,7 @@ StatusOr EmitToBinary(mlir::ModuleOp module) { } Status Run(llvm::StringRef input_file, llvm::StringRef output_file, + llvm::StringRef host_triple, llvm::ArrayRef architectures, llvm::ArrayRef tile_sizes, llvm::ArrayRef unroll_factors, int64_t max_supported_rank, @@ -130,7 +138,7 @@ Status Run(llvm::StringRef input_file, llvm::StringRef output_file, /*apply_cl_options=*/true)); // Get binary. 
- TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(*module)); + TF_ASSIGN_OR_RETURN(std::string binary, EmitToBinary(host_triple, *module)); // Write .a file. TF_RETURN_IF_ERROR( @@ -167,6 +175,8 @@ int main(int argc, char** argv) { llvm::cl::opt jit_compile( "jit", llvm::cl::desc("Generate only a JIT compiler invocation."), llvm::cl::init(false)); + llvm::cl::opt host_triple( + "host-triple", llvm::cl::desc("Override host triple for module")); llvm::cl::list architectures( "arch", llvm::cl::desc("target architectures (e.g. sm_70 or compute_75)"), llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); @@ -189,16 +199,25 @@ int main(int argc, char** argv) { llvm::cl::init(false)); tensorflow::InitMlir y(&argc, &argv); - llvm::InitializeNativeTarget(); - llvm::InitializeNativeTargetAsmPrinter(); + + LLVMInitializeX86Target(); + LLVMInitializeX86TargetInfo(); + LLVMInitializeX86TargetMC(); + LLVMInitializeX86AsmPrinter(); + + LLVMInitializeAArch64Target(); + LLVMInitializeAArch64TargetInfo(); + LLVMInitializeAArch64TargetMC(); + LLVMInitializeAArch64AsmPrinter(); + mlir::registerPassManagerCLOptions(); mlir::registerMLIRContextCLOptions(); llvm::cl::ParseCommandLineOptions(argc, argv, "TF op kernel generator\n"); auto status = tensorflow::kernel_gen::Run( - input_file, output_file, architectures, tile_sizes, unroll_factors, - max_supported_rank, print_ptx, print_llvmir, enable_ftz, index_64bit, - jit_compile, jit_i64_indexed_for_large_tensors); + input_file, output_file, host_triple, architectures, tile_sizes, + unroll_factors, max_supported_rank, print_ptx, print_llvmir, enable_ftz, + index_64bit, jit_compile, jit_i64_indexed_for_large_tensors); if (!status.ok()) { LOG(ERROR) << status; return 1; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/buffer_reuse_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/buffer_reuse_pass.cc index e2cf9051761..796f133cff5 100644 --- 
a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/buffer_reuse_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/buffer_reuse_pass.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include #include diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc index e54f36f5684..af0943ded1f 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include "llvm/ADT/ArrayRef.h" diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc index 1a8bc8882f8..ed1138849e5 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/embed_tf_framework_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/fuse_inner_parallel_loops_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/fuse_inner_parallel_loops_pass.cc index afba89b635d..5698d6c7025 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/fuse_inner_parallel_loops_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/fuse_inner_parallel_loops_pass.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/SCF/IR/SCF.h" // from @llvm-project #include "mlir/Dialect/SCF/Transforms/Transforms.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc index 0d76cd4c93c..561e87e6dda 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -13,6 +13,13 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include +#include +#include +#include +#include + #include "llvm/Transforms/Utils/Cloning.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Target/LLVMIR/Export.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/parallel_loops_to_sequential.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/parallel_loops_to_sequential.cc index 0bfacccc1a1..35b11c8abf0 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/parallel_loops_to_sequential.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/parallel_loops_to_sequential.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/SCF/IR/SCF.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h index 167b370e17f..84712606e98 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h @@ -17,6 +17,7 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_MLIR_TOOLS_KERNEL_GEN_TRANSFORMS_PASSES_H_ #include +#include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/GPU/IR/GPUDialect.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc index b6f3a98237c..16116cac215 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/rewrite_tf_framework_assert.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #include +#include #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/same_shape_propagation.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/same_shape_propagation.cc index 246556da0ca..1e422f6ab88 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/same_shape_propagation.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/same_shape_propagation.cc @@ -18,6 +18,8 @@ limitations under the License. // sizes of operands with equal shapes. 
#include +#include +#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMapInfo.h" diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc index 3be09ac912b..b308241aede 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/shape_to_descriptors_pass.cc @@ -16,6 +16,9 @@ limitations under the License. // This file combines patterns for lowering shape dialect to standard ops, // structured control flow and descriptors. +#include +#include + #include "mlir/Conversion/ShapeToStandard/ShapeToStandard.h" // from @llvm-project #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc index 9a5b0749888..8adb4c1eebe 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc @@ -13,8 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include +#include #include "mlir/Conversion/LLVMCommon/Pattern.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -125,7 +127,8 @@ class TFAllocOpConverter : public ConvertToLLVMCallOpPattern { llvm::to_vector<4>(adaptor.getDynSizes()), rewriter, sizes, strides, sizeBytes); // Get number of elements. 
- Value num_elements = getNumElements(loc, sizes, rewriter); + Value num_elements = + getNumElements(loc, memref_type, adaptor.getDynSizes(), rewriter); // Get element size. Value element_size = getSizeInBytes(loc, memref_type.getElementType(), rewriter); diff --git a/tensorflow/compiler/mlir/tosa/tests/BUILD b/tensorflow/compiler/mlir/tosa/tests/BUILD index e7c4a5b9a61..a523ba82942 100644 --- a/tensorflow/compiler/mlir/tosa/tests/BUILD +++ b/tensorflow/compiler/mlir/tosa/tests/BUILD @@ -7,6 +7,7 @@ package( ) glob_lit_tests( + name = "all_tests", data = [":test_utilities"], driver = "@llvm-project//mlir:run_lit.sh", size_override = { diff --git a/tensorflow/compiler/mlir/tosa/tests/strip-quant-types.mlir b/tensorflow/compiler/mlir/tosa/tests/strip-quant-types.mlir index fbadf264186..803c1415dc4 100644 --- a/tensorflow/compiler/mlir/tosa/tests/strip-quant-types.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/strip-quant-types.mlir @@ -1,23 +1,25 @@ // RUN: tf-opt --split-input-file --tosa-strip-quant-types --verify-each %s | FileCheck %s -// CHECK-LABEL: @test_add_qi8 -// CHECK-SAME: %arg0: tensor) -> tensor -func.func @test_add_qi8(%arg0: tensor>) -> tensor> { - %0 = "tosa.add"(%arg0, %arg0) : (tensor>, tensor>) -> tensor> +// ----- - // CHECK: %[[VAR0:.+]] = "tosa.add"(%arg0, %arg0) : (tensor, tensor) -> tensor - // CHECK: return %[[VAR0]] : tensor - func.return %0 : tensor> +// CHECK-LABEL: @test_max_pool2d_qi8 +// CHECK-SAME: %arg0: tensor<1x4x4x4xi8>) -> tensor<1x4x4x4xi8> +func.func @test_max_pool2d_qi8(%arg0: tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> { + %0 = "tosa.max_pool2d"(%arg0) {kernel = array, pad = array, stride = array} : (tensor<1x4x4x4x!quant.uniform>) -> tensor<1x4x4x4x!quant.uniform> + + // CHECK: %[[VAR0:.+]] = "tosa.max_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> : (tensor<1x4x4x4xi8>) -> tensor<1x4x4x4xi8> + // CHECK: return %[[VAR0]] : tensor<1x4x4x4xi8> + func.return %0 : 
tensor<1x4x4x4x!quant.uniform> } -// ---- +// ----- -// CHECK-LABEL: @test_add_qu8 +// CHECK-LABEL: @test_bitwise_not_qu8 // CHECK-SAME: %arg0: tensor) -> tensor -func.func @test_add_qu8(%arg0: tensor>) -> tensor> { - %0 = "tosa.add"(%arg0, %arg0) : (tensor>, tensor>) -> tensor> +func.func @test_bitwise_not_qu8(%arg0: tensor>) -> tensor> { + %0 = "tosa.bitwise_not"(%arg0) : (tensor>) -> tensor> - // CHECK: %[[VAR0:.+]] = "tosa.add"(%arg0, %arg0) : (tensor, tensor) -> tensor + // CHECK: %[[VAR0:.+]] = "tosa.bitwise_not"(%arg0) : (tensor) -> tensor // CHECK: return %[[VAR0]] : tensor - func.return %0 : tensor> + func.return %0 : tensor> } diff --git a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir index 5cacdf03552..47e2571e2bb 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tf-to-tosa-pipeline.mlir @@ -565,7 +565,7 @@ func.func @test_argmax(%arg0: tensor<13x21x3xf32>) -> tensor<21x3xi32> { // ----- // CHECK-LABEL: test_avg_pool2d -// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{kernel = array, pad = array, stride = array} +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{acc_type = f32, kernel = array, pad = array, stride = array}> func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> { %2 = "tf.AvgPool"(%arg0) {data_format = "NHWC", ksize = [1, 1, 1, 1], padding = "SAME", strides = [1, 1, 1, 1]} : (tensor<1x32x32x8xf32>) -> tensor<1x32x32x8xf32> func.return %2 : tensor<1x32x32x8xf32> @@ -617,8 +617,8 @@ func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<4x11x1xf32> { // CHECK-LABEL: test_strided_slice // CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array} -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array} -// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array} +// CHECK-DAG: %[[VAR1:.*]] = 
"tosa.reshape"(%[[VAR0]]) <{new_shape = array} +// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array} // CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array} func.func @test_strided_slice(%arg0: tensor<13x21x3xf32>) -> tensor<9x7x2xf32> { %2 = "tf.Const"() {value = dense<[4, 0, 1]> : tensor<3xi64>} : () -> tensor<3xi64> diff --git a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir index ddfc7eefe81..145b1877761 100644 --- a/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir +++ b/tensorflow/compiler/mlir/tosa/tests/tfl-to-tosa-pipeline.mlir @@ -962,7 +962,7 @@ func.func @test_less_equal_dynamic(%arg0: tensor<13x1x3xf32>, %arg1: tensor<13x? // ----- // CHECK-LABEL: test_avg_pool2d -// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{acc_type = f32, kernel = array, pad = array, stride = array}> func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xf32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> @@ -971,7 +971,7 @@ func.func @test_avg_pool2d(%arg0: tensor<1x32x32x8xf32>) -> tensor<*xf32> { // ----- // CHECK-LABEL: test_avg_pool2d_dynamic -// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{kernel = array, pad = array, stride = array}> +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{acc_type = f32, kernel = array, pad = array, stride = array}> func.func @test_avg_pool2d_dynamic(%arg0: tensor) -> tensor<*xf32> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor) -> tensor<*xf32> 
func.return %0 : tensor<*xf32> @@ -1064,14 +1064,14 @@ func.func @test_slice(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { // CHECK-LABEL: test_strided_slice_simple // CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array}> // CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> func.func @test_strided_slice_simple(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> %cst_1 = arith.constant dense<[1, 3, 1]> : tensor<3xi32> - %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1079,14 +1079,14 @@ func.func @test_strided_slice_simple(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32 // CHECK-LABEL: test_strided_slice_simple_negative // CHECK-DAG: %[[VAR0:.*]] = "tosa.slice"(%arg0) <{size = array, start = array}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> +// CHECK-DAG: %[[VAR2:.*]] = 
"tosa.slice"(%[[VAR1]]) <{size = array, start = array}> // CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> func.func @test_strided_slice_simple_negative(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, -3, 3]> : tensor<3xi32> %cst_1 = arith.constant dense<[1, 3, 1]> : tensor<3xi32> - %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 1 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1099,7 +1099,7 @@ func.func @test_strided_slice_strideless(%arg0: tensor<13x21x3xf32>) -> tensor<* %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> %cst_1 = arith.constant dense<[1, 1, 1]> : tensor<3xi32> - %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 2 : i32} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 2 : i32, offset = false} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1107,14 +1107,14 @@ func.func @test_strided_slice_strideless(%arg0: tensor<13x21x3xf32>) -> tensor<* // CHECK-LABEL: test_strided_slice_shrink // CHECK-DAG: %[[VAR0:.*]] = 
"tosa.slice"(%arg0) <{size = array, start = array}> -// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> -// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array}> +// CHECK-DAG: %[[VAR1:.*]] = "tosa.reshape"(%[[VAR0]]) <{new_shape = array}> +// CHECK-DAG: %[[VAR2:.*]] = "tosa.slice"(%[[VAR1]]) <{size = array, start = array}> // CHECK: %[[VAR3:.*]] = "tosa.reshape"(%[[VAR2]]) <{new_shape = array}> func.func @test_strided_slice_shrink(%arg0: tensor<13x21x3xf32>) -> tensor<*xf32> { %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> %cst_1 = arith.constant dense<[1, 3, 1]> : tensor<3xi32> - %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 5 : i32} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 5 : i32, offset = false} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1127,7 +1127,7 @@ func.func @test_strided_slice_shrink_ignore_stride(%arg0: tensor<13x21x3xf32>) - %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> %cst_1 = arith.constant dense<[1, 3, 1]> : tensor<3xi32> - %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 3 : i32} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 3 : i32, 
offset = false} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1142,7 +1142,7 @@ func.func @test_strided_slice_unstrided(%arg0: tensor<13x21x3xf32>) -> tensor<*x %cst = arith.constant dense<[4, 0, 1]> : tensor<3xi32> %cst_0 = arith.constant dense<[13, 21, 3]> : tensor<3xi32> %cst_1 = arith.constant dense<[1, 1, -1]> : tensor<3xi32> - %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<13x21x3xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1157,7 +1157,7 @@ func.func @test_strided_slice_unstrided_shorter(%arg0: tensor<13x21x3xf32>) -> t %cst = arith.constant dense<[4, 0]> : tensor<2xi32> %cst_0 = arith.constant dense<[13, 21]> : tensor<2xi32> %cst_1 = arith.constant dense<[1, -1]> : tensor<2xi32> - %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<13x21x3xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 3 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<13x21x3xf32>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1171,7 +1171,7 @@ func.func @test_strided_slice_unstrided_shorter(%arg0: tensor<13x21x3xf32>) -> t func.func @test_strided_slice_dynamic_masked(%arg0: 
tensor<10x?x?xf32>, %arg1: tensor<3xi32>) -> tensor<*xf32> { %cst_0 = arith.constant dense<[13, -1, 3]> : tensor<3xi32> %cst_1 = arith.constant dense<[1, -1, -1]> : tensor<3xi32> - %0 = "tfl.strided_slice"(%arg0, %arg1, %cst_0, %cst_1) {begin_mask = 7 : i32, ellipsis_mask = 0 : i32, end_mask = 7 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<10x?x?xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %arg1, %cst_0, %cst_1) {begin_mask = 7 : i32, ellipsis_mask = 0 : i32, end_mask = 7 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<10x?x?xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } @@ -1190,7 +1190,7 @@ func.func @test_strided_slice_dynamic_begin(%arg0: tensor<10x?x?xf32>) -> tensor // CHECK: %[[VAR0:.*]] = "tosa.reverse"(%arg0) <{axis = 1 : i64}> // CHECK: %[[VAR1:.*]] = "tosa.reverse"(%[[VAR0]]) <{axis = 2 : i64}> // CHECK: return %[[VAR1]] - %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 7 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32} : (tensor<10x?x?xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %cst, %cst_0, %cst_1) {begin_mask = 2 : i32, ellipsis_mask = 0 : i32, end_mask = 7 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 0 : i32, offset = false} : (tensor<10x?x?xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> func.return %0 : tensor<*xf32> } // ----- @@ -1203,10 +1203,10 @@ func.func @test_strided_slice_dynamic_end(%arg0: tensor<10x?x?xf32>) -> tensor<* %stride = arith.constant dense<[1, 2, -1]> : tensor<3xi32> // CHECK: %[[SLICE1:.+]] = "tosa.slice"(%arg0) <{size = array, start = array}> - // CHECK: %[[RESHAPE1:.+]] = "tosa.reshape"(%[[SLICE1]]) <{new_shape = array}> - // CHECK: %[[SLICE2:.+]] = "tosa.slice"(%[[RESHAPE1]]) <{size = array, 
start = array}> + // CHECK: %[[RESHAPE1:.+]] = "tosa.reshape"(%[[SLICE1]]) <{new_shape = array}> + // CHECK: %[[SLICE2:.+]] = "tosa.slice"(%[[RESHAPE1]]) <{size = array, start = array}> // CHECK: %[[RESHAPE2:.+]] = "tosa.reshape"(%[[SLICE2]]) <{new_shape = array}> - %0 = "tfl.strided_slice"(%arg0, %begin, %end, %stride) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 2 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 4 : i32} : (tensor<10x?x?xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> + %0 = "tfl.strided_slice"(%arg0, %begin, %end, %stride) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 2 : i32, new_axis_mask = 0 : i32, shrink_axis_mask = 4 : i32, offset = false} : (tensor<10x?x?xf32>, tensor<3xi32>, tensor<3xi32>, tensor<3xi32>) -> tensor<*xf32> // CHECK: return %[[RESHAPE2]] func.return %0 : tensor<*xf32> } @@ -1882,7 +1882,7 @@ func.func @test_mul_qi8(%arg0: tensor<13x21x3x!quant.uniform, pad = array, quantization_info = #tosa.unary_quant, stride = array}> +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{acc_type = i32, kernel = array, pad = array, quantization_info = #tosa.unary_quant, stride = array}> // CHECK-SAME: -> tensor<1x32x32x8x!quant.uniform> func.func @test_avg_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform>) -> tensor<*x!quant.uniform> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8x!quant.uniform>) -> tensor<*x!quant.uniform> @@ -1892,7 +1892,7 @@ func.func @test_avg_pool2d_qi8(%arg0: tensor<1x32x32x8x!quant.uniform, pad = array, stride = array}> +// CHECK: %[[VAR0:.*]] = "tosa.avg_pool2d"(%arg0) <{acc_type = i32, kernel = array, pad = array, stride = array}> // CHECK-SAME: -> tensor<1x32x32x8xi16> func.func @test_avg_pool2d_i16(%arg0: tensor<1x32x32x8xi16>) -> tensor<*xi16> { %0 = "tfl.average_pool_2d"(%arg0) {filter_height = 1 : i32, 
filter_width = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x8xi16>) -> tensor<*xi16> diff --git a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc index 62058cbe799..c3ef4e0bd67 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/convert_tfl_uint8.cc @@ -27,7 +27,9 @@ limitations under the License. #include #include #include +#include #include +#include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project @@ -58,7 +60,7 @@ namespace { class ConvertUint8ToInt8 : public impl::TosaConvertTFLUint8PassBase { public: - explicit ConvertUint8ToInt8() {} + explicit ConvertUint8ToInt8() = default; void runOnOperation() override; }; diff --git a/tensorflow/compiler/mlir/tosa/transforms/dequantize_tfl_softmax.cc b/tensorflow/compiler/mlir/tosa/transforms/dequantize_tfl_softmax.cc index 13c60e8af21..b64e4eda6d5 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/dequantize_tfl_softmax.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/dequantize_tfl_softmax.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include #include #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project @@ -36,7 +37,7 @@ namespace { class TosaDequantizeTFLSoftmax : public impl::TosaDequantizeTFLSoftmaxPassBase { public: - explicit TosaDequantizeTFLSoftmax() {} + explicit TosaDequantizeTFLSoftmax() = default; void runOnOperation() override; }; diff --git a/tensorflow/compiler/mlir/tosa/transforms/fuse_bias_tf.cc b/tensorflow/compiler/mlir/tosa/transforms/fuse_bias_tf.cc index 1fc479502d2..a3521eea92b 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/fuse_bias_tf.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/fuse_bias_tf.cc @@ -19,8 +19,10 @@ limitations under the License. #include #include #include +#include #include #include +#include #include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project @@ -44,7 +46,7 @@ namespace { class FuseBiasTF : public impl::TosaFusebiasTFPassBase { public: - explicit FuseBiasTF() {} + explicit FuseBiasTF() = default; void runOnOperation() override; }; diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc index 54539429695..e4cf77db4cd 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_common.cc @@ -23,12 +23,16 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tosa/transforms/legalize_common.h" +#include #include +#include #include #include #include +#include #include #include +#include #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" @@ -1414,6 +1418,14 @@ std::optional convertSoftmaxOp(PatternRewriter& rewriter, Operation* op, return std::nullopt; } + // beta is not exposed from the TF API, assume only beta=1.0 is supported + // For more details: https://github.com/tensorflow/tensorflow/issues/60435 + if (beta != 1.0) { + (void)rewriter.notifyMatchFailure( + op, "beta values other than 1.0 are not supported"); + return std::nullopt; + } + // reduce_sum on last dimension int32_t input_rank = input_type.getShape().size(); ArrayRef logits_shape = output_type.getShape(); @@ -2271,9 +2283,7 @@ std::optional convertStridedSliceOp( // tensor // // 2. Reshape2: Reshape the tensor from (1) such that each dimension with - // stride is split into two dimensions of size_i/stride_i, stride_i. A naive - // implementation doubles the input tensor rank, but only dimensions being - // strided actually need to be doubled. + // abs(stride) != 1 is split into two dimensions of size_i/stride_i, stride_i. // // 3. Slice3: Slice the tensor from (2) such that we select index [0] from // each of the stride_i dimensions in (2) @@ -2316,7 +2326,6 @@ std::optional convertStridedSliceOp( int32_t strides_size = strides.size(); for (auto stride : strides) all_strides_one &= abs(stride) == 1; - // If all of the masks are set we can just bypass the entire thing. const int32_t all_masks_one = (1 << strides_size) - 1; @@ -2448,10 +2457,14 @@ std::optional convertStridedSliceOp( } // Step 2: reshape the sliced array - SmallVector a2_shape(input_rank * 2); + SmallVector a2_shape; for (int i = 0; i < input_rank; ++i) { - a2_shape[i * 2 + 0] = a1_size[i] == -1 ? 
-1 : a1_size[i] / abs(strides[i]); - a2_shape[i * 2 + 1] = abs(strides[i]); + int64_t abs_stride_i = abs(strides[i]); + a2_shape.push_back(a1_size[i] == -1 ? -1 : a1_size[i] / abs_stride_i); + if (abs_stride_i != 1) { + // only add a stride dimension if strides[i] != 1 + a2_shape.push_back(abs_stride_i); + } } auto a2_reshape_op = CreateOpAndInfer( @@ -2462,19 +2475,24 @@ std::optional convertStridedSliceOp( tensorflow::ConvertMlirShapeToTF(a2_shape))); // Step 3: take a slice along the strides - SmallVector a3_begin(input_rank * 2), a3_size(input_rank * 2); + SmallVector a3_begin, a3_size; for (int i = 0; i < input_rank; ++i) { - a3_begin[i * 2 + 0] = 0; - a3_begin[i * 2 + 1] = 0; + int64_t abs_stride_i = abs(strides[i]); + a3_begin.push_back(0); if (shrink_axis_mask & (1 << i)) { - a3_size[i * 2 + 0] = 1; + a3_size.push_back(1); } else { - a3_size[i * 2 + 0] = - (a1_size[i] == -1) ? -1 : (a1_size[i] / abs(strides[i])); + a3_size.push_back((a1_size[i] == -1) ? -1 : (a1_size[i] / abs_stride_i)); + } + if (abs_stride_i != 1) { + // previous reshape only adds a stride dimension if strides[i] != 1 + a3_begin.push_back(0); + a3_size.push_back(1); } - a3_size[i * 2 + 1] = 1; } + assert(a2_shape.size() == a3_begin.size()); + assert(a2_shape.size() == a3_size.size()); auto a3_slice_op = CreateOpAndInfer( rewriter, op->getLoc(), diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc index 5418eab622c..082cfe74018 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf.cc @@ -632,8 +632,14 @@ LogicalResult ConvertTFAvgPoolOp::matchAndRewrite( return failure(); } - CreateReplaceOpAndInfer( - rewriter, op, output_type, tf_avgpool_op.getValue(), kernel, stride, pad); + // Tosa supports FP16 and FP32 accumulator type for FP16 input. 
When the time + // FP16 is supported, the accumulator type can be selected based on trade-off + // between performance and accuracy. Set to FP32 by default. + auto acc_attr = mlir::TypeAttr::get(rewriter.getF32Type()); + + CreateReplaceOpAndInfer(rewriter, op, output_type, + tf_avgpool_op.getValue(), kernel, + stride, pad, acc_attr); return success(); } diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf_tfl.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf_tfl.cc index d40688d570d..72c86e40a7e 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tf_tfl.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tf_tfl.cc @@ -15,6 +15,9 @@ limitations under the License. // Legalize TensorFlow and TensorFlow Lite to TOSA +#include +#include + #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -38,7 +41,7 @@ namespace { // Performs lowering to TOSA dialect class LegalizeTFTFL : public impl::TosaLegalizeTFTFLPassBase { public: - explicit LegalizeTFTFL() {} + explicit LegalizeTFTFL() = default; void runOnOperation() override; }; diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc index 0162ddd4a8a..87573d30ed5 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_tfl.cc @@ -15,6 +15,7 @@ limitations under the License. // Legalize TensorFlow Lite to TOSA +#include #include #include #include @@ -23,6 +24,7 @@ limitations under the License. 
#include #include #include +#include #include #include #include @@ -681,7 +683,12 @@ static LogicalResult matchAndRewriteAddSub(Operation* op, Value output; if (output_is_qtype && input_lhs_is_qtype && input_rhs_is_qtype) { - ShapedType rescale_type = output_type.clone(rewriter.getI32Type()); + ShapedType rescale_type_output = output_type.clone(rewriter.getI32Type()); + ShapedType rescale_type_input_left = + input_lhs_type.clone(rewriter.getI32Type()); + ShapedType rescale_type_input_right = + input_rhs_type.clone(rewriter.getI32Type()); + UniformQuantizedType input_lhs_qtype = input_lhs_type.getElementType() .dyn_cast(); @@ -743,10 +750,11 @@ static LogicalResult matchAndRewriteAddSub(Operation* op, Value op1_none_half_scale_intermediate; if (output_qtype.getStorageTypeIntegralWidth() == 16) { auto tfl_add_lhs_casted = CreateOpAndInfer( - rewriter, op->getLoc(), rescale_type, tfl_add_op.getLhs()); + rewriter, op->getLoc(), rescale_type_input_left, + tfl_add_op.getLhs()); op1_none_half_scale_intermediate = CreateOpAndInfer( - rewriter, op->getLoc(), rescale_type, + rewriter, op->getLoc(), rescale_type_input_left, tfl_add_lhs_casted.getResult(), getTosaConstTensorSingleI32(rewriter, op, input_shift)); } else { @@ -773,10 +781,11 @@ static LogicalResult matchAndRewriteAddSub(Operation* op, Value op2_none_half_scale_intermediate; if (output_qtype.getStorageTypeIntegralWidth() == 16) { auto tfl_add_rhs_casted = CreateOpAndInfer( - rewriter, op->getLoc(), rescale_type, tfl_add_op.getRhs()); + rewriter, op->getLoc(), rescale_type_input_right, + tfl_add_op.getRhs()); op2_none_half_scale_intermediate = CreateOpAndInfer( - rewriter, op->getLoc(), rescale_type, + rewriter, op->getLoc(), rescale_type_input_right, tfl_add_rhs_casted.getResult(), getTosaConstTensorSingleI32(rewriter, op, input_shift)); } else { @@ -789,8 +798,9 @@ static LogicalResult matchAndRewriteAddSub(Operation* op, } #endif // TFLITE_DOUBLE_ROUNDING - auto op3_add_op1_op2 = CreateOpAndInfer( - rewriter, 
op->getLoc(), rescale_type, op1_rescale_lhs, op2_rescale_rhs); + auto op3_add_op1_op2 = + CreateOpAndInfer(rewriter, op->getLoc(), rescale_type_output, + op1_rescale_lhs, op2_rescale_rhs); Value op4_rescale_op3 = buildRescaleFromInt32( rewriter, op, output_type, op3_add_op1_op2.getResult(), output_rescale_scale, output_qtype.getZeroPoint()); @@ -1190,6 +1200,13 @@ LogicalResult ConvertTFLAveragePool2DOp::matchAndRewrite( auto average_etype = input_type.getElementType(); auto average_type = output_type.clone(average_etype); + // Tosa supports FP16 and FP32 accumulator type for FP16 input. When the time + // FP16 is supported, the accumulator type can be selected based on trade-off + // between performance and accuracy. Set to FP32 by default. + TypeAttr acc_attr = average_etype.isa() + ? mlir::TypeAttr::get(rewriter.getF32Type()) + : mlir::TypeAttr::get(rewriter.getIntegerType(32)); + Value result; if (average_etype.isa()) { // TensorFlow Lite doesn't use the zero point when calculating @@ -1200,11 +1217,11 @@ LogicalResult ConvertTFLAveragePool2DOp::matchAndRewrite( /*input_zp=*/0, /*output_zp=*/0); result = CreateOpAndInfer( rewriter, op->getLoc(), average_type, tfl_avgpool_op.getInput(), - kernel_size, stride, pad, quant_attr); + kernel_size, stride, pad, acc_attr, quant_attr); } else { result = CreateOpAndInfer( rewriter, op->getLoc(), average_type, tfl_avgpool_op.getInput(), - kernel_size, stride, pad); + kernel_size, stride, pad, acc_attr); } if (average_type != output_type) { result = CreateOpAndInfer(rewriter, op->getLoc(), output_type, diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc index ff8616687a2..29f913edb87 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.cc @@ -15,6 +15,9 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h" +#include +#include +#include #include #include "llvm/ADT/SmallVector.h" diff --git a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h index b2e76197fb5..07a781c6240 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h +++ b/tensorflow/compiler/mlir/tosa/transforms/legalize_utils.h @@ -20,6 +20,7 @@ limitations under the License. #include #include #include +#include #include #include #include diff --git a/tensorflow/compiler/mlir/tosa/transforms/lower_complex_types.cc b/tensorflow/compiler/mlir/tosa/transforms/lower_complex_types.cc index da989275d52..0180574f1d0 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/lower_complex_types.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/lower_complex_types.cc @@ -31,6 +31,8 @@ limitations under the License. // resulting graph is free of illegal complex tensors. #include +#include +#include #include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -55,7 +57,7 @@ namespace { class LowerComplexTypes : public impl::TosaLowerComplexTypesPassBase { public: - explicit LowerComplexTypes() {} + explicit LowerComplexTypes() = default; void runOnOperation() override; }; diff --git a/tensorflow/compiler/mlir/tosa/transforms/strip_quant_types.cc b/tensorflow/compiler/mlir/tosa/transforms/strip_quant_types.cc index c55908f31fc..b0ce9d0d80f 100644 --- a/tensorflow/compiler/mlir/tosa/transforms/strip_quant_types.cc +++ b/tensorflow/compiler/mlir/tosa/transforms/strip_quant_types.cc @@ -27,7 +27,9 @@ limitations under the License. 
#include #include #include +#include #include +#include #include "mlir/Dialect/Tosa/IR/TosaOps.h" // from @llvm-project #include "mlir/Dialect/Tosa/Utils/QuantUtils.h" // from @llvm-project @@ -56,7 +58,7 @@ namespace { class StripQuantTypes : public impl::TosaStripQuantTypesPassBase { public: - explicit StripQuantTypes() {} + explicit StripQuantTypes() = default; void runOnOperation() override; }; diff --git a/tensorflow/compiler/mlir/utils/name_utils.cc b/tensorflow/compiler/mlir/utils/name_utils.cc index d966d887b1f..ba50a923bc4 100644 --- a/tensorflow/compiler/mlir/utils/name_utils.cc +++ b/tensorflow/compiler/mlir/utils/name_utils.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/utils/name_utils.h" #include +#include #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" diff --git a/tensorflow/compiler/tests/BUILD b/tensorflow/compiler/tests/BUILD index 1fb31b9db74..d3ea4b077a9 100644 --- a/tensorflow/compiler/tests/BUILD +++ b/tensorflow/compiler/tests/BUILD @@ -1,16 +1,17 @@ +load("//tensorflow/compiler/tests:build_defs.bzl", "tf_xla_py_strict_test") +load("//tensorflow:strict.default.bzl", "py_strict_library", "py_strict_test") load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") -load("//tensorflow:tensorflow.default.bzl", "cuda_py_test", "tf_cuda_cc_test") +load("//tensorflow:tensorflow.default.bzl", "cuda_py_strict_test", "tf_cuda_cc_test") load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load( "//tensorflow/compiler/tests:build_defs.bzl", "generate_backend_suites", - "tf_xla_py_test", ) load( "//tensorflow/core/platform:build_config_root.bzl", "tf_cuda_tests_tags", ) -load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_test") +load("//tensorflow/python/tpu:tpu.bzl", "tpu_py_strict_test") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -38,7 +39,7 @@ package_group( generate_backend_suites() -py_library( +py_strict_library( name = "xla_test", 
testonly = 1, srcs = ["xla_test.py"], @@ -46,21 +47,24 @@ py_library( visibility = [":friends"], deps = [ "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:random_seed", - "//tensorflow/python:session", - "//tensorflow/python:variables", + "//tensorflow/python/client:session", "//tensorflow/python/compiler/xla:compiler_py", + "//tensorflow/python/eager:context", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:random_seed", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:flags", "//tensorflow/python/platform:tf_logging", + "//tensorflow/python/tpu:tpu_py", "//third_party/py/numpy", ], ) -py_library( +py_strict_library( name = "test_utils", testonly = 1, srcs = [ @@ -74,7 +78,7 @@ py_library( ], ) -py_test( +py_strict_test( name = "xla_test_test", size = "small", srcs = ["xla_test_test.py"], @@ -88,7 +92,7 @@ py_test( ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "adadelta_test", size = "medium", srcs = ["adadelta_test.py"], @@ -99,15 +103,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "adagrad_test", size = "small", srcs = ["adagrad_test.py"], @@ -118,16 +123,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - 
"//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "adagrad_da_test", size = "small", srcs = ["adagrad_da_test.py"], @@ -139,15 +144,17 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "adam_test", size = "small", srcs = ["adam_test.py"], @@ -159,16 +166,19 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variable_scope", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "add_n_test", size = "small", srcs = ["add_n_test.py"], @@ -181,16 +191,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:list_ops", - 
"//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:list_ops", + "//tensorflow/python/ops:math_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "argminmax_test", size = "small", srcs = ["argminmax_test.py"], @@ -202,15 +212,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "binary_ops_test", size = "medium", srcs = ["binary_ops_test.py"], @@ -224,19 +234,22 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:bitwise_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:math_ops_gen", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_gen", + "//tensorflow/python/ops:bitwise_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:math_ops_gen", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:nn_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "complex_div_test", size = "medium", srcs = ["complex_div_test.py"], @@ -254,16 +267,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - 
"//tensorflow/python:math_ops_gen", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops_gen", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "bucketize_op_test", size = "small", srcs = ["bucketize_op_test.py"], @@ -274,15 +286,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "categorical_op_test", size = "small", srcs = ["categorical_op_test.py"], @@ -294,15 +307,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", - "//tensorflow/python:random_ops", - "//tensorflow/python:standard_ops", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:random_seed", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:random_ops", + "//tensorflow/python/ops:stateless_random_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "cholesky_op_test", size = "medium", srcs = ["cholesky_op_test.py"], @@ -315,17 +331,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:map_fn", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + 
"//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:linalg_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "cond_test", size = "small", srcs = ["cond_test.py"], @@ -336,20 +353,26 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:cond", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:control_flow_switch_case", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", - "//tensorflow/python:tensor_array_ops", - "//tensorflow/python:training", - "//tensorflow/python/eager:function", + "//tensorflow/python/client:session", + "//tensorflow/python/compiler/xla", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:cond", + "//tensorflow/python/ops:control_flow_ops", + "//tensorflow/python/ops:control_flow_switch_case", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:random_ops", + "//tensorflow/python/ops:tensor_array_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "self_adjoint_eig_op_test", size = "medium", srcs = ["self_adjoint_eig_op_test.py"], @@ -361,18 +384,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:map_fn", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:linalg_ops", 
"//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "searchsorted_op_test", size = "small", timeout = "moderate", @@ -384,12 +404,13 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:platform_test", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "svd_op_test", size = "medium", srcs = ["svd_op_test.py"], @@ -406,18 +427,17 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:map_fn", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:tensor_shape", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:linalg_ops", + "//tensorflow/python/ops:linalg_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "matrix_inverse_op_test", size = "small", timeout = "moderate", @@ -429,15 +449,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:linalg_ops", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:linalg_ops", + "//tensorflow/python/ops:math_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "matrix_solve_op_test", size = "small", timeout = "moderate", @@ -449,14 +470,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:linalg_ops", "//tensorflow/python:platform_test", - "//tensorflow/python:random_ops", + "//tensorflow/python/ops:linalg_ops", + 
"//tensorflow/python/ops:random_ops", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "matrix_triangular_solve_op_test", size = "small", timeout = "moderate", @@ -469,16 +491,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:linalg_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "clustering_test", size = "small", srcs = ["clustering_test.py"], @@ -489,14 +513,17 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "concat_ops_test", size = "medium", srcs = ["concat_ops_test.py"], @@ -507,17 +534,19 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:array_ops_gen", - "//tensorflow/python:framework", - "//tensorflow/python:gradient_checker", - "//tensorflow/python:gradients", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_gen", + 
"//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:gradients_impl", + "//tensorflow/python/ops:math_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "conv2d_test", size = "medium", srcs = ["conv2d_test.py"], @@ -530,17 +559,38 @@ tf_xla_py_test( deps = [ ":test_utils", ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:nn", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:nn_ops_gen", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( + name = "tensor_float_32_test", + size = "medium", + srcs = ["tensor_float_32_test.py"], + python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], + use_xla_device = False, # Uses tf.function(jit_compile=True) + deps = [ + ":xla_test", + "//tensorflow/python:platform_test", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:config", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + ], +) + +tf_xla_py_strict_test( name = "conv3d_test", size = "medium", srcs = ["conv3d_test.py"], @@ -552,16 +602,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:nn", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:gradient_checker", + "//tensorflow/python/ops:nn_grad", + 
"//tensorflow/python/ops:nn_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "depthwise_conv_op_test", size = "medium", srcs = ["depthwise_conv_op_test.py"], @@ -575,17 +627,19 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", "//tensorflow/python:nn", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:nn_grad", + "//tensorflow/python/ops:nn_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "dynamic_slice_ops_test", size = "small", srcs = ["dynamic_slice_ops_test.py"], @@ -595,15 +649,16 @@ tf_xla_py_test( "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], deps = [ - "//tensorflow/compiler/tests:xla_test", + ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:dtypes", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "einsum_op_test", size = "medium", srcs = ["einsum_op_test.py"], @@ -619,14 +674,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", - "//tensorflow/python:special_math_ops", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:special_math_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "reshape_op_test", size = "small", srcs = ["reshape_op_test.py"], @@ -636,16 +692,16 @@ 
tf_xla_py_test( "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], deps = [ - "//tensorflow/compiler/tests:xla_test", - "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:dtypes", + ":xla_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:test", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "dynamic_stitch_test", size = "small", srcs = ["dynamic_stitch_test.py"], @@ -655,14 +711,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:data_flow_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "extract_image_patches_op_test", size = "small", srcs = ["extract_image_patches_op_test.py"], @@ -672,14 +729,14 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "eager_test", size = "medium", srcs = ["eager_test.py"], @@ -691,19 +748,33 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:cond", - "//tensorflow/python:framework", - "//tensorflow/python:layers", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn", + "//tensorflow/core:protos_all_py", "//tensorflow/python:platform_test", - "//tensorflow/python:while_loop", - "//tensorflow/python/eager:function", + 
"//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:context", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:indexed_slices", + "//tensorflow/python/framework:ops", + "//tensorflow/python/layers", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:cond", + "//tensorflow/python/ops:embedding_ops", + "//tensorflow/python/ops:functional_ops", + "//tensorflow/python/ops:init_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:random_ops_gen", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:while_loop", + "//tensorflow/python/training:adam", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "fifo_queue_test", size = "medium", srcs = ["fifo_queue_test.py"], @@ -714,16 +785,14 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:data_flow_ops", "//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:data_flow_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "fft_test", size = "medium", srcs = ["fft_test.py"], @@ -737,15 +806,17 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", "//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:gradients_impl", "//tensorflow/python/ops/signal", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "slice_ops_test", size = "medium", srcs = ["slice_ops_test.py"], @@ -757,14 +828,15 
@@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:tensor_shape", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "ftrl_test", size = "medium", srcs = ["ftrl_test.py"], @@ -776,16 +848,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "ftrl_ops_test", size = "medium", srcs = ["ftrl_ops_test.py"], @@ -796,17 +868,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", - "//tensorflow/python:training", - "//tensorflow/python:variables", - "//tensorflow/python/tpu", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "function_test", size = "small", srcs = ["function_test.py"], @@ -817,13 +888,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:function", + 
"//tensorflow/python/ops:array_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "image_ops_test", size = "small", timeout = "long", @@ -837,20 +911,25 @@ tf_xla_py_test( python_version = "PY3", shard_count = 10, tags = [ + "no_oss", # TODO(b/282033702): Re-enable this test in OSS. "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip "optonly", # Times out frequently in fastbuild mode. ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", "//tensorflow/python:image_ops", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:image_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "listdiff_op_test", size = "small", srcs = ["listdiff_op_test.py"], @@ -862,16 +941,14 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:framework_ops", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:for_generated_wrappers", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "lrn_ops_test", size = "medium", srcs = ["lrn_ops_test.py"], @@ -882,15 +959,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", "//tensorflow/python:nn", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + 
"//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:nn_ops_gen", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "manip_ops_test", size = "small", srcs = ["manip_ops_test.py"], @@ -901,14 +981,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:manip_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:manip_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "matrix_band_part_test", size = "medium", timeout = "long", @@ -921,15 +1002,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "matrix_diag_ops_test", size = "medium", timeout = "long", @@ -942,13 +1024,14 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:dtypes", "//tensorflow/python:platform_test", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_gen", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "momentum_test", size = "small", srcs = ["momentum_test.py"], @@ -959,16 +1042,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + 
"//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "nary_ops_test", size = "small", srcs = ["nary_ops_test.py"], @@ -979,14 +1064,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "nullary_ops_test", size = "small", srcs = ["nullary_ops_test.py"], @@ -997,13 +1084,14 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/ops:control_flow_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "pooling_ops_test", size = "medium", srcs = ["pooling_ops_test.py"], @@ -1015,15 +1103,19 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:nn_ops_gen", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "pooling_ops_3d_test", size = "medium", srcs = ["pooling_ops_3d_test.py"], @@ -1035,16 +1127,17 @@ tf_xla_py_test( ], deps = [ ":xla_test", - 
"//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:nn_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "proximal_adagrad_test", size = "medium", srcs = ["proximal_adagrad_test.py"], @@ -1055,14 +1148,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "proximal_gradient_descent_test", size = "medium", srcs = ["proximal_gradient_descent_test.py"], @@ -1073,14 +1168,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "qr_op_test", size = "medium", srcs = ["qr_op_test.py"], @@ -1098,17 +1195,17 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - 
"//tensorflow/python:training", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:linalg_ops", + "//tensorflow/python/ops:math_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "unstack_test", size = "medium", srcs = ["unstack_test.py"], @@ -1126,17 +1223,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_stack", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "random_ops_test", size = "medium", srcs = ["random_ops_test.py"], @@ -1147,17 +1242,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow:tensorflow_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", - "//tensorflow/python:random_ops", - "//tensorflow/python:standard_ops", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:random_ops", + "//tensorflow/python/ops/distributions:special_math", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "reduce_ops_test", size = "medium", srcs = ["reduce_ops_test.py"], @@ -1169,16 +1265,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:errors", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + 
"//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "reduce_window_test", size = "small", srcs = ["reduce_window_test.py"], @@ -1190,15 +1288,15 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:errors", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:function", + "//tensorflow/python/ops:array_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "reverse_ops_test", size = "medium", srcs = ["reverse_ops_test.py"], @@ -1209,13 +1307,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:test", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "reverse_sequence_op_test", size = "medium", srcs = ["reverse_sequence_op_test.py"], @@ -1227,15 +1327,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) # copybara:uncomment_begin(google-only) -# tf_xla_py_test( +# tf_xla_py_strict_test( # name = "reverse_sequence_op_args_test", # size = "medium", # srcs = ["reverse_sequence_op_args_test.py"], @@ -1249,17 +1349,16 @@ tf_xla_py_test( # deps = [ # 
":xla_test", # "//tensorflow/compiler/jit:xla_cpu_jit", # DisableOnExport -# "//tensorflow/python:array_ops", -# "//tensorflow/python:framework", -# "//tensorflow/python:platform_test", # "//tensorflow/python/compat:v2_compat", -# "//tensorflow/python/eager:function", +# "//tensorflow/python/eager:def_function", +# "//tensorflow/python/framework:errors", +# "//tensorflow/python/ops:array_ops", # "//tensorflow/python/platform:client_testlib", # ], # ) # copybara:uncomment_end -tf_xla_py_test( +tf_xla_py_strict_test( name = "rmsprop_test", size = "small", srcs = ["rmsprop_test.py"], @@ -1270,16 +1369,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "scan_ops_test", size = "medium", srcs = ["scan_ops_test.py"], @@ -1292,15 +1391,19 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "segment_reduction_ops_test", size = "medium", srcs = ["segment_reduction_ops_test.py"], @@ -1312,15 +1415,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - 
"//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:math_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/client", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "spacetobatch_op_test", size = "medium", srcs = ["spacetobatch_op_test.py"], @@ -1332,17 +1436,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "sparse_to_dense_op_test", - size = "small", + size = "medium", srcs = ["sparse_to_dense_op_test.py"], enable_mlir_bridge = True, python_version = "PY3", @@ -1351,15 +1456,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", - "//tensorflow/python:sparse_ops", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:sparse_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "stack_ops_test", size = "small", srcs = ["stack_ops_test.py"], @@ -1372,15 +1478,17 @@ tf_xla_py_test( use_xla_device = False, deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/compiler/xla", + 
"//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:data_flow_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "stateful_random_ops_test", size = "medium", srcs = ["stateful_random_ops_test.py"], @@ -1395,17 +1503,25 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", - "//tensorflow/python:standard_ops", - "//tensorflow/python:stateful_random_ops", + "//tensorflow/python/client", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:config", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", "//tensorflow/python/kernel_tests/random:util", + "//tensorflow/python/ops:stateful_random_ops", + "//tensorflow/python/ops:stateful_random_ops_gen", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:flags", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "stateless_random_ops_test", size = "medium", srcs = ["stateless_random_ops_test.py"], @@ -1418,16 +1534,25 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", - "//tensorflow/python:standard_ops", - "//tensorflow/python:stateless_random_ops", + "//tensorflow/python/client", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:config", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", "//tensorflow/python/kernel_tests/random:util", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + 
"//tensorflow/python/ops:stateless_random_ops", + "//tensorflow/python/ops:stateless_random_ops_v2_gen", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "tensor_array_ops_test", size = "medium", srcs = ["tensor_array_ops_test.py"], @@ -1443,21 +1568,27 @@ tf_xla_py_test( use_xla_device = False, deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:math_ops_gen", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", - "//tensorflow/python:platform_test", - "//tensorflow/python:tensor_array_grad", - "//tensorflow/python:tensor_array_ops", - "//tensorflow/python:training", + "//tensorflow/python/compiler/xla", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:control_flow_util", + "//tensorflow/python/ops:data_flow_ops_gen", + "//tensorflow/python/ops:gradients_impl", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:tensor_array_grad", + "//tensorflow/python/ops:tensor_array_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "tensor_list_ops_test", size = "small", srcs = ["tensor_list_ops_test.py"], @@ -1470,16 +1601,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:list_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python/eager:function", + "//tensorflow/python/framework:constant_op", + 
"//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:list_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "ternary_ops_test", size = "medium", srcs = ["ternary_ops_test.py"], @@ -1491,38 +1624,46 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", "//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:math_ops_gen", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "unary_ops_test", size = "medium", srcs = ["unary_ops_test.py"], enable_mlir_bridge = True, python_version = "PY3", - shard_count = 32, + shard_count = 50, tags = [ "no_cuda_asan", # times out "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:bitwise_ops", + "//tensorflow/python/ops:functional_ops_gen", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:nn_ops_gen", + "//third_party/py/numpy", + "@six_archive//:six", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "fused_batchnorm_test", size = "medium", srcs = ["fused_batchnorm_test.py"], @@ 
-1535,20 +1676,18 @@ tf_xla_py_test( deps = [ ":test_utils", ":xla_test", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:math_ops_gen", "//tensorflow/python:nn", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:gradient_checker", + "//tensorflow/python/ops:nn_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "variable_ops_test", size = "small", srcs = ["variable_ops_test.py"], @@ -1560,18 +1699,26 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:math_ops_gen", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:init_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:state_ops", + "//tensorflow/python/ops:state_ops_gen", + "//tensorflow/python/ops:variable_scope", + "//tensorflow/python/ops:variables", + "//tensorflow/python/training", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "while_test", size = "small", srcs = ["while_test.py"], @@ -1583,16 +1730,22 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - 
"//tensorflow/python:platform_test", - "//tensorflow/python:training", - "//tensorflow/python:while_loop", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:function", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:control_flow_ops", + "//tensorflow/python/ops:gradients_impl", + "//tensorflow/python/ops:map_fn", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:while_loop", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "case_test", size = "small", srcs = ["case_test.py"], @@ -1605,18 +1758,17 @@ tf_xla_py_test( use_xla_device = False, # Uses tf.function(jit_compile=True) deps = [ ":xla_test", - "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:control_flow_switch_case", - "//tensorflow/python:framework", "//tensorflow/python:image_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:control_flow_switch_case", + "//tensorflow/python/ops:io_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "cast_ops_test", size = "small", srcs = ["cast_ops_test.py"], @@ -1628,18 +1780,20 @@ tf_xla_py_test( use_xla_device = False, # Uses tf.function(jit_compile=True) deps = [ ":xla_test", - "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", "//tensorflow/python:image_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + 
"//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:control_flow_ops", + "//tensorflow/python/ops:io_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:random_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "gather_test", size = "medium", srcs = ["gather_test.py"], @@ -1650,16 +1804,17 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:data_flow_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/platform:flags", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "gather_nd_op_test", size = "medium", srcs = ["gather_nd_op_test.py"], @@ -1670,14 +1825,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "scatter_nd_op_test", size = "medium", srcs = ["scatter_nd_op_test.py"], @@ -1689,14 +1845,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "sort_ops_test", size = "medium", srcs = ["sort_ops_test.py"], @@ -1709,16 +1866,22 @@ 
tf_xla_py_test( "optonly", ], deps = [ - "//tensorflow/compiler/tests:xla_test", + ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:dtypes", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:function", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "data_format_ops_test", size = "small", srcs = ["data_format_ops_test.py"], @@ -1728,15 +1891,16 @@ tf_xla_py_test( "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip ], deps = [ - "//tensorflow/compiler/tests:xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:dtypes", - "//tensorflow/python:nn_ops", + ":xla_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "xla_device_test", size = "small", srcs = ["xla_device_test.py"], @@ -1748,14 +1912,17 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:control_flow_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -cuda_py_test( +cuda_py_strict_test( name = "xla_device_gpu_test", size = "small", srcs = ["xla_device_gpu_test.py"], @@ -1765,16 +1932,16 @@ cuda_py_test( 
xla_enable_strict_auto_jit = False, xla_enabled = True, deps = [ - "//tensorflow/python:array_ops", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", + "//tensorflow/python/client:session", + "//tensorflow/python/eager:context", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/platform:client_testlib", ], ) -cuda_py_test( +cuda_py_strict_test( name = "jit_test", size = "medium", srcs = ["jit_test.py"], @@ -1788,21 +1955,25 @@ cuda_py_test( deps = [ ":test_utils", "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:cond", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:framework", - "//tensorflow/python:gradients", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn_ops", - "//tensorflow/python:while_loop", + "//tensorflow/python/client:session", "//tensorflow/python/compiler/xla:compiler_py", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:function", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:cond", + "//tensorflow/python/ops:control_flow_ops", + "//tensorflow/python/ops:gradients_impl", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:while_loop", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -cuda_py_test( +cuda_py_strict_test( name = "async_comp_test", size = "medium", srcs = ["async_comp_test.py"], @@ -1813,18 +1984,18 @@ cuda_py_test( xla_enable_strict_auto_jit = False, xla_enabled = True, deps = [ - ":test_utils", "//tensorflow/core:protos_all_py", - 
"//tensorflow/python:array_ops", - "//tensorflow/python:client", - "//tensorflow/python:client_testlib", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python/compiler/xla:compiler_py", + "//tensorflow/python/client:session", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:function", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/platform:client_testlib", ], ) -cuda_py_test( +cuda_py_strict_test( name = "dense_layer_test", size = "medium", srcs = ["dense_layer_test.py"], @@ -1836,11 +2007,13 @@ cuda_py_test( deps = [ ":test_utils", "//tensorflow/core:protos_all_py", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:layers", - "//tensorflow/python:variables", "//tensorflow/python/compiler/xla:compiler_py", + "//tensorflow/python/framework:ops", + "//tensorflow/python/layers", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) @@ -1962,22 +2135,23 @@ tf_cuda_cc_test( ], ) -py_library( +py_strict_library( name = "lstm", testonly = 1, srcs = ["lstm.py"], srcs_version = "PY3", deps = [ - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:math_ops", - "//tensorflow/python:random_ops", - "//tensorflow/python:variable_v1", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:random_ops", + "//tensorflow/python/ops:variable_v1", "@six_archive//:six", ], ) -cuda_py_test( +cuda_py_strict_test( name = "lstm_test", srcs = ["lstm_test.py"], tags = [ @@ -1988,13 +2162,17 @@ cuda_py_test( deps = [ ":lstm", ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - 
"//tensorflow/python:framework", - "//tensorflow/python:gradients", - "//tensorflow/python:init_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:variables", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:gradients_impl", + "//tensorflow/python/ops:init_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:variables", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) @@ -2021,7 +2199,7 @@ tf_library( tfcompile_flags = ["--xla_cpu_multi_thread_eigen=false"], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "fake_quant_ops_test", size = "medium", srcs = ["fake_quant_ops_test.py"], @@ -2032,12 +2210,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_gen", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "placeholder_test", size = "small", srcs = ["placeholder_test.py"], @@ -2048,13 +2229,14 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:variables", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "quantized_ops_test", size = "medium", srcs = ["quantized_ops_test.py"], @@ -2066,16 +2248,18 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:bitwise_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:dtypes", - "//tensorflow/python:math_ops", "//tensorflow/python:platform_test", + 
"//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:bitwise_ops", + "//tensorflow/python/ops:math_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "xla_ops_test", size = "medium", srcs = ["xla_ops_test.py"], @@ -2086,16 +2270,26 @@ tf_xla_py_test( ], deps = [ ":xla_test", + "//tensorflow/compiler/tf2xla/ops:gen_xla_ops", "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:array_ops", - "//tensorflow/python:errors", - "//tensorflow/python:framework", + "//tensorflow/compiler/xla:xla_data_proto_py", "//tensorflow/python:platform_test", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:function", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor_shape", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:stateless_random_ops", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "xla_custom_call_ops_test", size = "small", srcs = ["xla_custom_call_ops_test.py"], @@ -2113,14 +2307,16 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor_spec", + "//tensorflow/python/ops:random_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "runtime_shape_check_test", size = "small", srcs = ["runtime_shape_check_test.py"], @@ -2137,13 
+2333,17 @@ tf_xla_py_test( use_xla_device = False, deps = [ ":xla_test", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "conv_node_name_test", size = "medium", srcs = ["conv_node_name_test.py"], @@ -2156,17 +2356,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:layers", - "//tensorflow/python:nn", - "//tensorflow/python:nn_ops", - "//tensorflow/python:nn_ops_gen", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:ops", + "//tensorflow/python/layers", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:control_flow_ops", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "tridiagonal_solve_ops_test", size = "medium", srcs = ["tridiagonal_solve_ops_test.py"], @@ -2178,15 +2377,20 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:framework", - "//tensorflow/python:linalg_ops", - "//tensorflow/python:platform_test", - "//tensorflow/python:standard_ops", + "//tensorflow/python:gradients", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops/linalg:linalg_impl", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "tridiagonal_matmul_ops_test", size = "medium", srcs = ["tridiagonal_matmul_ops_test.py"], @@ -2198,15 +2402,22 @@ 
tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:framework", - "//tensorflow/python:gradient_checker_v2", - "//tensorflow/python:linalg_ops", "//tensorflow/python:platform_test", - "//tensorflow/python:standard_ops", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_stack", + "//tensorflow/python/ops:gradient_checker_v2", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:stateless_random_ops", + "//tensorflow/python/ops/linalg:linalg_impl", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "special_math_test", size = "medium", srcs = ["special_math_test.py"], @@ -2219,14 +2430,20 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:gradient_checker_v2", - "//tensorflow/python:math_ops", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/ops:gradient_checker_v2", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:math_ops_gen", + "//tensorflow/python/ops:random_ops_gen", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + "@absl_py//absl/flags", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "repeat_op_test", size = "medium", srcs = ["repeat_op_test.py"], @@ -2239,13 +2456,14 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:math_ops", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", "//tensorflow/python/platform:client_testlib", - "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "image_ops_jit_compile_test", size = "medium", srcs = 
["image_ops_jit_compile_test.py"], @@ -2262,13 +2480,20 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:extra_py_tests_deps", - "//tensorflow/python:math_ops", + "//tensorflow/python:image_ops", + "//tensorflow/python/eager:backprop", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", - "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "ensure_shape_op_test", size = "medium", srcs = ["ensure_shape_op_test.py"], @@ -2280,14 +2505,15 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:check_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "where_op_test", size = "small", srcs = ["where_op_test.py"], @@ -2303,17 +2529,16 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:errors", - "//tensorflow/python:framework", - "//tensorflow/python/compiler/xla:compiler_py", - "//tensorflow/python/tpu", + "//tensorflow/python/framework:config", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/tpu:tpu_py", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "where_op_tpu_test", size = "small", srcs = ["where_op_test.py"], @@ -2335,17 +2560,16 @@ tf_xla_py_test( ], deps = [ 
":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:client_testlib", - "//tensorflow/python:control_flow_ops", - "//tensorflow/python:errors", - "//tensorflow/python:framework", - "//tensorflow/python/compiler/xla:compiler_py", - "//tensorflow/python/tpu", + "//tensorflow/python/framework:config", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/tpu:tpu_py", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "risc_ops_test", size = "small", srcs = ["risc_ops_test.py"], @@ -2356,16 +2580,18 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:framework", - "//tensorflow/python:is_mlir_bridge_test_true", "//tensorflow/python:platform_test", - "//tensorflow/python/eager:function", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:is_mlir_bridge_test_true", + "//tensorflow/python/framework:ops", "//tensorflow/python/ops/risc:risc_ops", "//tensorflow/python/platform:client_testlib", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "const_arg_test", size = "small", srcs = ["const_arg_test.py"], @@ -2376,14 +2602,14 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:framework", - "//tensorflow/python:is_mlir_bridge_test_false", + "//tensorflow/compiler/tf2xla/python:xla", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", ], ) -cuda_py_test( +cuda_py_strict_test( name = "const_test", size = "small", srcs = ["const_test.py"], @@ -2391,14 +2617,16 @@ cuda_py_test( xla_enable_strict_auto_jit = False, xla_enabled = True, deps = [ - ":xla_test", - "//tensorflow/python:constant_op", - "//tensorflow/python:framework", + "//tensorflow/python/eager:def_function", + 
"//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:test_lib", "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tpu_py_test( +tpu_py_strict_test( name = "giant_const_op_test", srcs = [ "giant_const_op_test.py", @@ -2410,14 +2638,19 @@ tpu_py_test( python_version = "PY3", tags = ["no_oss"], deps = [ - "//tensorflow/python/distribute/cluster_resolver:cluster_resolver_lib", + "//tensorflow/python/distribute:tpu_strategy", + "//tensorflow/python/distribute/cluster_resolver:tpu_cluster_resolver_py", + "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:remote", "//tensorflow/python/eager:test", - "//tensorflow/python/tpu:tpu_lib", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/platform:flags", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "sharding_util_ops_test", srcs = ["sharding_util_ops_test.py"], disabled_backends = [ @@ -2434,40 +2667,41 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:constant_op", - "//tensorflow/python:framework_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:resource_variable_ops", - "//tensorflow/python:session", - "//tensorflow/python:variables", + "//tensorflow/python/client:session", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:resource_variable_ops", + "//tensorflow/python/ops:tpu_ops_gen", + "//tensorflow/python/ops:variables", "//tensorflow/python/platform:client_testlib", - "//tensorflow/python/tpu:tpu_lib", + "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tpu_py_test( +tpu_py_strict_test( name = "approx_topk_test", srcs = ["approx_topk_test.py"], disable_experimental = False, disable_mlir_bridge = False, tags = ["no_oss"], deps = [ - 
"//tensorflow/python:array_ops", - "//tensorflow/python:math_ops", - "//tensorflow/python:nn_ops", "//tensorflow/python/eager:backprop", "//tensorflow/python/eager:def_function", "//tensorflow/python/eager:test", "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", - "//tensorflow/python/framework:test_lib", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", + "//tensorflow/python/ops:variables", "//third_party/py/numpy", "@absl_py//absl/testing:parameterized", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "xla_call_module_test", size = "small", srcs = ["xla_call_module_test.py"], @@ -2479,14 +2713,50 @@ tf_xla_py_test( use_xla_device = False, # Uses tf.function(jit_compile=True) deps = [ ":xla_test", + "//tensorflow/compiler/mlir/stablehlo", + "//tensorflow/compiler/tf2xla/ops:gen_xla_ops", "//tensorflow/compiler/tf2xla/python:xla", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", - "//tensorflow/python:training", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:function", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( + name = "xla_call_module_no_platform_check_test", + size = "small", + srcs = ["xla_call_module_no_platform_check_test.py"], + enable_mlir_bridge = False, + env = {"TF_XLA_FLAGS": "--tf_xla_call_module_disabled_checks=platform"}, + python_version = "PY3", + tags = [ + "no_pip", # TODO(b/149738646): fix pip install so these tests run on kokoro pip + ], + use_xla_device = False, # Uses tf.function(jit_compile=True) + deps = [ + ":xla_test", + "//tensorflow/compiler/mlir/stablehlo", + "//tensorflow/compiler/tf2xla/ops:gen_xla_ops", + "//tensorflow/compiler/tf2xla/python:xla", + 
"//tensorflow/python:platform_test", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:function", + "//tensorflow/python/framework:ops", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/platform:client_testlib", + "//third_party/py/numpy", + ], +) + +tf_xla_py_strict_test( name = "bincount_op_test", size = "small", srcs = ["bincount_op_test.py"], @@ -2499,10 +2769,12 @@ tf_xla_py_test( deps = [ ":xla_test", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:errors", + "//tensorflow/python/ops:math_ops_gen", ], ) -tf_xla_py_test( +tf_xla_py_strict_test( name = "unique_ops_test", size = "small", srcs = ["unique_ops_test.py"], @@ -2518,9 +2790,11 @@ tf_xla_py_test( ], deps = [ ":xla_test", - "//tensorflow/python:array_ops", - "//tensorflow/python:constant_op", - "//tensorflow/python:framework", "//tensorflow/python:platform_test", + "//tensorflow/python/framework:constant_op", + "//tensorflow/python/framework:dtypes", + "//tensorflow/python/ops:array_ops", + "//tensorflow/python/ops:array_ops_gen", + "//third_party/py/numpy", ], ) diff --git a/tensorflow/compiler/tests/build_defs.bzl b/tensorflow/compiler/tests/build_defs.bzl index 8bf82c34644..7343bb9b89e 100644 --- a/tensorflow/compiler/tests/build_defs.bzl +++ b/tensorflow/compiler/tests/build_defs.bzl @@ -1,6 +1,7 @@ """Build rules for Tensorflow/XLA testing.""" load("//tensorflow:tensorflow.bzl", "py_test") +load("//tensorflow:strict.default.bzl", "py_strict_test") load("//tensorflow/compiler/tests:plugin.bzl", "plugins") load( "//tensorflow/core/platform:build_config_root.bzl", @@ -21,6 +22,7 @@ def tf_xla_py_test( disabled_backends = None, use_xla_device = True, enable_mlir_bridge = True, + test_rule = py_test, **kwargs): """Generates py_test targets, one per XLA backend. 
@@ -111,7 +113,7 @@ def tf_xla_py_test( extra_tag = [] updated_name = test_name - mlir_bridge_dep = "//tensorflow/python:is_mlir_bridge_test_true" + mlir_bridge_dep = "//tensorflow/python/framework:is_mlir_bridge_test_true" has_mlir_dep = (mlir_bridge_dep in deps) if mlir_option: if updated_name.endswith("_test"): @@ -130,7 +132,7 @@ def tf_xla_py_test( # version. continue - py_test( + test_rule( name = updated_name, srcs = srcs, srcs_version = "PY3", @@ -145,6 +147,9 @@ def tf_xla_py_test( test_names.append(updated_name) native.test_suite(name = name, tests = test_names) +def tf_xla_py_strict_test(**kwargs): + tf_xla_py_test(test_rule = py_strict_test, **kwargs) + def generate_backend_suites(backends = []): """Generates per-backend test_suites that run all tests for a backend.""" if not backends: diff --git a/tensorflow/compiler/tests/giant_const_op_test.py b/tensorflow/compiler/tests/giant_const_op_test.py index 014b9d5f1eb..9a73a95cb34 100644 --- a/tensorflow/compiler/tests/giant_const_op_test.py +++ b/tensorflow/compiler/tests/giant_const_op_test.py @@ -25,7 +25,6 @@ from tensorflow.python.eager import test from tensorflow.python.framework import constant_op from tensorflow.python.framework import dtypes from tensorflow.python.platform import flags -from tensorflow.python.tpu import tpu_strategy_util FLAGS = flags.FLAGS flags.DEFINE_string("tpu", "", "Name of TPU to connect to.") @@ -45,7 +44,7 @@ def get_tpu_cluster_resolver(): def get_tpu_strategy(): resolver = get_tpu_cluster_resolver() remote.connect_to_cluster(resolver) - tpu_strategy_util.initialize_tpu_system(resolver) + tpu_cluster_resolver.initialize_tpu_system(resolver) return tpu_lib.TPUStrategyV2(resolver) diff --git a/tensorflow/compiler/tests/randomized_tests.cc b/tensorflow/compiler/tests/randomized_tests.cc index b5d3baec7fc..d4d9afaad54 100644 --- a/tensorflow/compiler/tests/randomized_tests.cc +++ b/tensorflow/compiler/tests/randomized_tests.cc @@ -43,8 +43,19 @@ limitations under the License. 
// * StridedSliceGrad (need to use shape function to compute sensible inputs) #include +#include +#include +#include +#include +#include +#include +#include +#include #include +#include #include +#include +#include #include "absl/algorithm/container.h" #include "absl/container/flat_hash_set.h" @@ -504,7 +515,7 @@ OpTest::OpTest() { << ". To reproduce the " "results of this test, pass flag --tf_xla_random_seed=" << seed; - generator_.reset(new std::mt19937(seed)); + generator_ = std::make_unique(seed); } namespace { @@ -532,7 +543,7 @@ template class TensorGenerator { public: explicit TensorGenerator(OpTest& test) : test_(test) {} - virtual ~TensorGenerator() {} + virtual ~TensorGenerator() = default; virtual DataType dtype() = 0; virtual void RandomVals(std::optional lo, std::optional hi, bool needs_unique_values, diff --git a/tensorflow/compiler/tests/sharding_util_ops_test.py b/tensorflow/compiler/tests/sharding_util_ops_test.py index 26e39ca2a2b..7d5ac5771f1 100644 --- a/tensorflow/compiler/tests/sharding_util_ops_test.py +++ b/tensorflow/compiler/tests/sharding_util_ops_test.py @@ -294,7 +294,7 @@ class XlaSplitNDOpTest(xla_test.XLATestCase, parameterized.TestCase): def testRanked(self, graph_fn, rank): num_splits = [2] * rank num_outputs = 2 << (rank - 1) - input_value = np.reshape(np.arange(np.product(num_splits)), num_splits) + input_value = np.reshape(np.arange(np.prod(num_splits)), num_splits) for dtype in self.numeric_types: with self.session() as sess, self.device_scope(): split = graph_fn( diff --git a/tensorflow/compiler/tests/tensor_float_32_test.py b/tensorflow/compiler/tests/tensor_float_32_test.py new file mode 100644 index 00000000000..f02b69948f4 --- /dev/null +++ b/tensorflow/compiler/tests/tensor_float_32_test.py @@ -0,0 +1,106 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests that the PrecisionConfig is set if TF32 is disabled.""" + +from tensorflow.compiler.tests import xla_test +from tensorflow.python.eager import def_function +from tensorflow.python.framework import config +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops +from tensorflow.python.platform import googletest + + +class TensorFloat32ConvTest(xla_test.XLATestCase): + + def tearDown(self): + super().tearDown() + config.enable_tensor_float_32_execution(True) + + def _test_fn(self, fn, inputs): + with ops.device('device:{}:0'.format(self.device)): + # Test with TF32 disabled + config.enable_tensor_float_32_execution(False) + compiled_fn = def_function.function(fn, jit_compile=True) + hlo_text = compiled_fn.experimental_get_compiler_ir(*inputs)(stage='hlo') + self.assertIn('operand_precision={highest,highest}', hlo_text) + + # Test the output is sufficiently precise by comparing with FP64 results + out = compiled_fn(*inputs) + f64_out = compiled_fn(*[math_ops.cast(x, 'float64') for x in inputs]) + self.assertAllClose(out, f64_out, rtol=1e-5, atol=1e-5) + + # Test with TF32 enabled. Recompile fn because enabling TF32 does not + # reset function cache. 
+ config.enable_tensor_float_32_execution(True) + compiled_fn = def_function.function(fn, jit_compile=True) + hlo_text = compiled_fn.experimental_get_compiler_ir(*inputs)(stage='hlo') + # operand_precision is not in HLO if it's the default value. + self.assertNotIn('operand_precision', hlo_text) + + def test_matmul(self): + x = array_ops.fill((1024, 1024), 1 + 2**-12) + y = array_ops.fill((1024, 1024), 1.0) + + def matmul(x, y): + return math_ops.matmul(x, y) + + self._test_fn(matmul, [x, y]) + + def test_batch_matmul(self): + x = array_ops.fill((2, 1024, 1024), 1 + 2**-12) + y = array_ops.fill((2, 1024, 1024), 1.0) + + def batch_matmul(x, y): + return math_ops.matmul(x, y) + + self._test_fn(batch_matmul, [x, y]) + + def test_conv2d(self): + x = array_ops.fill((2, 20, 20, 32), 1 + 2**-12) + y = array_ops.fill((3, 3, 32, 32), 1.0) + + def conv2d(x, y): + return nn_ops.conv2d(x, y, [1, 1, 1, 1], padding='SAME') + + self._test_fn(conv2d, [x, y]) + + def test_conv2d_backprop_input(self): + y = array_ops.fill((3, 3, 32, 32), 1 + 2**-12) + out_backprop = array_ops.fill((2, 20, 20, 32), 1.0) + + def conv2d_backprop_input(y, out_backprop): + return nn_ops.conv2d_backprop_input( + (2, 20, 20, 32), y, out_backprop, [1, 1, 1, 1], padding='SAME' + ) + + self._test_fn(conv2d_backprop_input, [y, out_backprop]) + + def test_conv2d_backprop_filter(self): + x = array_ops.fill((2, 20, 20, 32), 1 + 2**-12) + out_backprop = array_ops.fill((2, 20, 20, 32), 1.0) + + def conv2d_backprop_filter(x, out_backprop): + return nn_ops.conv2d_backprop_filter( + x, (3, 3, 32, 32), out_backprop, [1, 1, 1, 1], padding='SAME' + ) + + self._test_fn(conv2d_backprop_filter, [x, out_backprop]) + + +if __name__ == '__main__': + ops.enable_eager_execution() + googletest.main() diff --git a/tensorflow/compiler/tests/unary_ops_test.py b/tensorflow/compiler/tests/unary_ops_test.py index 34aeeafe976..c944c9e22e0 100644 --- a/tensorflow/compiler/tests/unary_ops_test.py +++ 
b/tensorflow/compiler/tests/unary_ops_test.py @@ -953,6 +953,17 @@ class UnaryOpsTest(xla_test.XLATestCase): lambda x: array_ops.bitcast(x, dtypes.uint64), np.array([1, 0x100000003f800000], np.int64), expected=np.array([1, 0x100000003f800000], np.uint64)) + self._assertOpOutputMatchesExpected( + lambda x: array_ops.bitcast(x, dtypes.float64), + np.array( + [0, 0x3FF0000000000000, 0xc3af161421c8e000, 0x4032000000000007], + np.uint64, + ), + expected=np.array( + [0, 1.0, -1.12e+18, 18.000000000000024869], np.float64 + ), + atol=0 + ) def testBitcastInt8ToFloat(self): self._assertOpOutputMatchesExpected( diff --git a/tensorflow/compiler/tests/xla_call_module_no_platform_check_test.py b/tensorflow/compiler/tests/xla_call_module_no_platform_check_test.py new file mode 100644 index 00000000000..9146711bda3 --- /dev/null +++ b/tensorflow/compiler/tests/xla_call_module_no_platform_check_test.py @@ -0,0 +1,87 @@ +# Copyright 2022 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Tests for XLA call module op wrapper with disabled platform check. 
+ +This test runs with --tf_xla_call_module_disabled_checks=platform +""" +from typing import Tuple + +import numpy as np + +from tensorflow.compiler.mlir.stablehlo import stablehlo +from tensorflow.compiler.tests import xla_test +from tensorflow.compiler.tf2xla.python import xla +from tensorflow.python.framework import dtypes +from tensorflow.python.framework import ops +from tensorflow.python.ops import array_ops +from tensorflow.python.platform import googletest + + +def serialize(module_str: str) -> Tuple[str, int]: + target = stablehlo.get_minimum_version() + byte_str = stablehlo.serialize_portable_artifact(module_str, target) + return byte_str, xla.call_module_maximum_supported_version() + + +class XlaCallModuleOpTest(xla_test.XLATestCase): + + def _assertOpOutputMatchesExpected(self, + op, + args, + expected, + equality_fn=None): + """Asserts op(*args) == expected.""" + with self.session() as session: + with self.test_scope(): + placeholders = [ + array_ops.placeholder(dtypes.as_dtype(arg.dtype), arg.shape) + for arg in args + ] + feeds = {placeholders[i]: args[i] for i in range(0, len(args))} + output = op(*placeholders) + result = session.run(output, feeds) + if not equality_fn: + equality_fn = self.assertAllClose + equality_fn(result, expected, rtol=1e-3) + + def test_platforms_errors(self): + """Error reporting for the platforms attribute.""" + x = np.float32(0.) + + module_str = """ +module @jit_f.0 { + func.func public @main(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} +""" + module, version = serialize(module_str) + def f(x): + return xla.call_module( + [x], version=version, + module=module, + Tout=[np.float32], + Sout=[()], + platforms=['RANDOM_PLATFORM'], + disabled_checks=[]) + # No error even though the `platforms` does not match the testing platform + self._assertOpOutputMatchesExpected(f, (x,), (x,)) + + +if __name__ == '__main__': + # This test is using Tensorflow sessions which are not compatible with eager + # mode. 
+ ops.disable_eager_execution() + googletest.main() diff --git a/tensorflow/compiler/tests/xla_call_module_test.py b/tensorflow/compiler/tests/xla_call_module_test.py index 01f30718217..31abd1f700e 100644 --- a/tensorflow/compiler/tests/xla_call_module_test.py +++ b/tensorflow/compiler/tests/xla_call_module_test.py @@ -18,24 +18,24 @@ import unittest import numpy as np +from tensorflow.compiler.mlir.stablehlo import stablehlo from tensorflow.compiler.tests import xla_test from tensorflow.compiler.tf2xla.ops import gen_xla_ops from tensorflow.compiler.tf2xla.python import xla - from tensorflow.python.eager import def_function from tensorflow.python.framework import dtypes from tensorflow.python.framework import errors +from tensorflow.python.framework import function from tensorflow.python.framework import ops from tensorflow.python.ops import array_ops from tensorflow.python.platform import googletest +from tensorflow.python.platform import test def serialize(module_str: str) -> Tuple[str, int]: - # TODO(b/274838200): error importing xla_extension in OSS - # target_version = '0.9.0' # TODO(gleasonk): use APIs to get this - # return xla_extension.mlir.serialize_portable_artifact( - # module_str, target_version), 4 - return module_str, 3 + target = stablehlo.get_minimum_version() + byte_str = stablehlo.serialize_portable_artifact(module_str, target) + return byte_str, xla.call_module_maximum_supported_version() class XlaCallModuleOpTest(xla_test.XLATestCase): @@ -64,7 +64,10 @@ class XlaCallModuleOpTest(xla_test.XLATestCase): if self.device in ['CPU', 'XLA_CPU']: return 'CPU' elif self.device in ['GPU', 'XLA_GPU']: - return 'CUDA' + if test.is_built_with_rocm(): + return 'ROCM' + else: + return 'CUDA' elif self.device in ['TPU', 'XLA_TPU']: return 'TPU' else: @@ -85,7 +88,34 @@ module @jit_f.0 { } """) return xla.call_module([x], version=version, - module=module, Tout=[x.dtype], Sout=[x.shape]) + module=module, Tout=[x.dtype], Sout=[x.shape], + 
platforms=[self.testing_platform()]) + + self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) + + def test_basic_with_token(self): + x = np.array([1.0, 2.0, 3.0], dtype=np.float32) + + def f(x): + # sin(cos(x)) + module, version = serialize(""" +module @jit_f.0 { + func.func public @main(%arg0: !stablehlo.token, %arg1: tensor<3xf32>) -> (!stablehlo.token, tensor<3xf32>) { + %0 = stablehlo.cosine %arg1 : tensor<3xf32> + %1 = stablehlo.sine %0 : tensor<3xf32> + return %arg0, %1 : !stablehlo.token, tensor<3xf32> + } +} +""") + return xla.call_module( + [x], + version=version, + module=module, + Tout=[x.dtype], + Sout=[x.shape], + has_token_input_output=True, + platforms=[self.testing_platform()], + ) self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) @@ -107,7 +137,8 @@ module @jit_f_jax.0 { return xla.call_module([x], version=version, module=module, Tout=[res.dtype], - Sout=[res.shape]) + Sout=[res.shape], + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -129,17 +160,19 @@ module @jit_f.0 { return xla.call_module([x, y], version=version, module=module, Tout=[x.dtype, y.dtype], - Sout=[x.shape, y.shape]) + Sout=[x.shape, y.shape], + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x, y), (np.sin(x), np.cos(y))) + # TODO(b/283439649): remove dim_args_spec support def test_dim_var_basic(self): x = np.arange(6, dtype=np.float32).reshape((2, 3)) def f(x): # x: f32[2, b] # Module takes another argument which is the value of b # (sin(x), x.shape[1]) - module, version = serialize(""" + module, _ = serialize(""" module @jit_f.0 { func.func public @main(%arg0: tensor, %arg1: tensor<2x?xf32>) -> (tensor<2x?xf32>, tensor) { %0 = stablehlo.sine %arg1 : tensor<2x?xf32> @@ -147,21 +180,24 @@ module @jit_f.0 { } } """) - return xla.call_module([x], version=version, - module=module, - Tout=[x.dtype, np.int32], - Sout=[(None, 3), ()], - dim_args_spec=['0.1']) + return 
gen_xla_ops.xla_call_module( + [x], + version=4, + module=module, + Tout=[x.dtype, np.int32], + Sout=[(None, 3), ()], + dim_args_spec=['0.1']) self._assertOpOutputMatchesExpected(f, (x,), (np.sin(x), x.shape[1])) + # TODO(b/283439649): remove dim_args_spec support def test_dim_var_basic_dim_arg_i64(self): x = np.arange(6, dtype=np.float32).reshape((2, 3)) def f(x): # x: f32[2, b] # Module takes another argument which is the value of b # (sin(x), x.shape[1]) - module, version = serialize(""" + module, _ = serialize(""" module @jit_f.0 { func.func public @main(%arg0: tensor, %arg1: tensor<2x?xf32>) -> (tensor<2x?xf32>, tensor) { %0 = stablehlo.sine %arg1 : tensor<2x?xf32> @@ -169,11 +205,12 @@ module @jit_f.0 { } } """) - return xla.call_module([x], - module=module, version=version, - Tout=[x.dtype, np.int64], - Sout=[(None, 3), ()], - dim_args_spec=['0.1']) + return gen_xla_ops.xla_call_module( + [x], + module=module, version=4, + Tout=[x.dtype, np.int64], + Sout=[(None, 3), ()], + dim_args_spec=['0.1']) self._assertOpOutputMatchesExpected(f, (x,), (np.sin(x), x.shape[1])) @@ -199,89 +236,64 @@ module @jit_f.0 { return xla.call_module([x], module=module, version=version, Tout=[x.dtype, np.int32], - Sout=[(None, 3), ()]) + Sout=[(None, 3), ()], + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x,), (np.sin(x), x.shape[1])) - def test_dim_args_spec_errors(self): - # x, y: f32[2, b, c] - x = np.arange(24, dtype=np.float32).reshape((2, 3, 4)) - y = x + def test_wrong_actual_args_errors(self): + x = np.arange(6, dtype=np.float32).reshape((3, 2)) + y = np.arange(6, dtype=np.int32).reshape((2, 3)) - # Module takes two prefix arguments with the values of b and c - # return (sin(x + y), x.shape[1]) + # x: f32[a, 2], return x module, version = serialize(""" module @jit_f.0 { - func.func public @main(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x?x?xf32>, %arg3: tensor<2x?x?xf32>) -> (tensor<2x?x?xf32>, tensor) { - %0 = stablehlo.add %arg2, %arg3 
: tensor<2x?x?xf32> - %1 = stablehlo.sine %0 : tensor<2x?x?xf32> - return %1, %arg0 : tensor<2x?x?xf32>, tensor + func.func public @main(%arg0: tensor, %arg1: tensor<*xi32>) -> tensor { + return %arg0 : tensor } } """) - dim_args_spec = ['0.1', '0.2'] def f(x, y): - return xla.call_module([x, y], - module=module, version=version, - Tout=[x.dtype, np.int32], - Sout=[(None, 3), ()], - dim_args_spec=dim_args_spec) - self._assertOpOutputMatchesExpected(f, (x, y), (np.sin(x + y), x.shape[1])) + return xla.call_module( + [x, y], + module=module, + version=version, + Tout=[x.dtype], + Sout=[(None, 2)], + platforms=[self.testing_platform()], + ) - dim_args_spec = ['0.0', '0.0', '0.0', '0.0'] # Too many dim_args_spec + self._assertOpOutputMatchesExpected(f, (x, y), (x,)) + + x_bad_etype = x.astype(np.int32) with self.assertRaisesRegex( errors.InvalidArgumentError, - 'The module should have 0 platform index arguments and ' - '4 dimension arguments, ' - 'but it has only 4 total arguments'): - self._assertOpOutputMatchesExpected(f, (x, y), - (np.sin(x + y), x.shape[1])) + 'Element type mismatch for argument 0 passed to XlaCallModule: ' + r'expecting tensor<\?x2xf32>, got tensor<3x2xi32>', + ): + self._assertOpOutputMatchesExpected(f, (x_bad_etype, y), (x_bad_etype,)) - dim_args_spec = ['0.0', '0.0', '0.0'] # dim_args_spec refers to non-scalar + y_bad_etype = y.astype(np.float32) with self.assertRaisesRegex( errors.InvalidArgumentError, - 'Module argument at index 2 should be a 0-dimensional integer-tensor ' - 'dimension argument but has type'): - self._assertOpOutputMatchesExpected(f, (x, y), - (np.sin(x + y), x.shape[1])) + 'Element type mismatch for argument 1 passed to XlaCallModule: ' + r'expecting tensor<\*xi32>, got tensor<2x3xf32>', + ): + self._assertOpOutputMatchesExpected(f, (x, y_bad_etype), (x,)) - dim_args_spec = ['1.0'] # Too few dim_args_spec + x_bad_shape = np.arange(15, dtype=np.float32).reshape(5, 3) with self.assertRaisesRegex( errors.InvalidArgumentError, - 
'Incorrect number of arguments passed to XlaCallModule: 2. ' - 'The module takes 4 arguments of which 0 platform index arguments ' - 'and 1 dimension arguments.'): - self._assertOpOutputMatchesExpected(f, (x, y), - (np.sin(x + y), x.shape[1])) - - dim_args_spec = ['0.b', '0.1'] # axis_idx not a number - with self.assertRaisesRegex( - errors.InvalidArgumentError, - "Syntax error in dim_args_spec '0.b'"): - self._assertOpOutputMatchesExpected(f, (x, y), - (np.sin(x + y), x.shape[1])) - - dim_args_spec = ['2.0', '0.1'] # arg_idx too large - with self.assertRaisesRegex( - errors.InvalidArgumentError, - 'Invalid argument index 2 when the number of non-dimension arguments ' - "is 2 in dim_arg_spec '2.0'"): - self._assertOpOutputMatchesExpected(f, (x, y), - (np.sin(x + y), x.shape[1])) - - dim_args_spec = ['0.3', '0.1'] # axis_idx too large - with self.assertRaisesRegex( - errors.InvalidArgumentError, - 'Invalid axis index 3 when the rank of non-dimension argument 0 ' - "is 3 in dim_arg_spec '0.3'"): - self._assertOpOutputMatchesExpected(f, (x, y), - (np.sin(x + y), x.shape[1])) + 'Shape mismatch for argument 0 passed to XlaCallModule: ' + r'expecting tensor<\?x2xf32>, got tensor<5x3xf32>', + ): + self._assertOpOutputMatchesExpected(f, (x_bad_shape, y), (x_bad_shape,)) def test_platforms_basic(self): x = np.float32(0.) - # returns x + 2. on CPU, x + 3. on GPU and x + 4. on TPU + # returns x + 2. on CPU, x + 3. on GPU (CUDA or ROCM) and x + 4. 
on TPU module, version = serialize(""" module @jit_f.0 { func.func public @main(%arg_platform_idx: tensor, %arg0: tensor) -> tensor { @@ -301,7 +313,7 @@ module @jit_f.0 { } """) - platforms = ['CPU', 'CUDA', 'TPU'] + platforms = ['CPU', 'CUDA', 'ROCM', 'TPU'] def f(x): return xla.call_module([x], version=version, module=module, @@ -309,40 +321,11 @@ module @jit_f.0 { Sout=[()], platforms=platforms) - expected_value = x + dict(CPU=2., CUDA=3., TPU=4.)[self.testing_platform()] + expected_value = ( + x + dict(CPU=2.0, CUDA=3.0, ROCM=3.0, TPU=4.0)[self.testing_platform()] + ) self._assertOpOutputMatchesExpected(f, (x,), (expected_value,)) - def test_platforms_with_dim_vars(self): - x = np.ones((3,), dtype=np.float32) - y = np.arange(3., dtype=np.float32) - - # returns x + x on CPU and x - x on TPU - module, version = serialize(""" -module @jit_f.0 { - func.func public @main(%arg_platform_idx: tensor, %arg_dim0: tensor, %arg0: tensor, %arg1: tensor) -> tensor { - %res = "stablehlo.case"(%arg_platform_idx) ({ - %0 = stablehlo.add %arg0, %arg1 : tensor - stablehlo.return %0 : tensor - }, { - %1 = stablehlo.subtract %arg0, %arg1 : tensor - stablehlo.return %1 : tensor - }) : (tensor) -> tensor - return %res : tensor - } -} -""") - def f(x, y): - return xla.call_module([x, y], version=version, - module=module, - Tout=[np.float32], - Sout=[(None,)], - platforms=['CPU', 'TPU'], - dim_args_spec=['0.0']) - - expected_value = x + (y if self.testing_platform() == 'CPU' else -y) - if self.testing_platform() in ['CPU', 'TPU']: - self._assertOpOutputMatchesExpected(f, (x, y), (expected_value,)) - def test_platforms_errors(self): """Error reporting for the platforms attribute.""" x = np.float32(0.) 
@@ -353,17 +336,26 @@ module @jit_f.0 { return %arg0 : tensor } } +""" + module_str_no_platform_arg = """ +module @jit_f.0 { + func.func public @main(%arg0: tensor) -> tensor { + return %arg0 : tensor + } +} """ module, version = serialize(module_str) - platforms = [] + platforms = [self.testing_platform()] + disabled_checks = [] def f(x): return xla.call_module([x], version=version, module=module, Tout=[np.float32], Sout=[()], - platforms=platforms) + platforms=platforms, + disabled_checks=disabled_checks) - # With empty platforms, there should be no platform_index argument + # With singleton `platforms`, there should be no platform_index argument with self.assertRaisesRegex( errors.InvalidArgumentError, 'Incorrect number of arguments passed to XlaCallModule: 1. ' @@ -371,23 +363,33 @@ module @jit_f.0 { 'and 0 dimension arguments.'): self._assertOpOutputMatchesExpected(f, (x,), (x,)) - # Same with a single platform - platforms = ['CPU'] - if self.testing_platform() == 'CPU': - with self.assertRaisesRegex( - errors.InvalidArgumentError, - 'Incorrect number of arguments passed to XlaCallModule: 1. ' - 'The module takes 2 arguments of which 0 platform index arguments ' - 'and 0 dimension arguments.'): - self._assertOpOutputMatchesExpected(f, (x,), (x,)) - platforms = ['RANDOM_PLATFORM_1', 'RANDOM_PLATFORM_2'] with self.assertRaisesRegex( errors.NotFoundError, 'The current platform .* is not among the platforms'): self._assertOpOutputMatchesExpected(f, (x,), (x,)) - platforms = ['CPU', 'CUDA'] + # Disable the check but have two platforms + platforms = ['RANDOM_PLATFORM_1', 'RANDOM_PLATFORM_2'] + disabled_checks = [xla.call_module_disable_check_platform()] + # No error + self._assertOpOutputMatchesExpected(f, (x,), (x,)) + + # Disable the check but have a single platform and hence no platform arg. 
+ platforms = ['RANDOM_PLATFORM_1'] + module, version = serialize(module_str_no_platform_arg) + # No error + self._assertOpOutputMatchesExpected(f, (x,), (x,)) + disabled_checks = [] + module, version = serialize(module_str) + + platforms = [] + with self.assertRaisesRegex( + errors.InvalidArgumentError, + 'must have non-empty platforms'): + self._assertOpOutputMatchesExpected(f, (x,), (x,)) + + platforms = ['CPU', 'CUDA', 'ROCM'] if self.testing_platform() not in platforms: with self.assertRaisesRegex( errors.NotFoundError, @@ -398,7 +400,7 @@ module @jit_f.0 { # The module cannot have i64 %arg_platform_idx module, version = serialize(module_str.replace('i32', 'i64')) - platforms = ['CPU', 'CUDA', 'TPU'] + platforms = ['CPU', 'CUDA', 'ROCM', 'TPU'] with self.assertRaisesRegex( errors.InvalidArgumentError, 'Module argument at index 0 should be a 0-dimensional ' @@ -428,7 +430,12 @@ module @jit_f.0 { # return np.arange(x.shape[0], dtype=np.int32) module, version = serialize(""" module @jit_fun.1 { - func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func public @main(%arg1: tensor) -> tensor { + %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> %1 = "stablehlo.dynamic_iota"(%0) {iota_dimension = 0 : i64} : (tensor<1xi32>) -> tensor return %1 : tensor @@ -439,7 +446,7 @@ module @jit_fun.1 { module=module, Tout=[res.dtype], Sout=[(None,)], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -471,7 +478,12 @@ module @jit_f.0 { def f(x): # x: f32[b, 3] module, version = serialize(""" module @jit_fun_flat_jax { - func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func public @main(%arg1: tensor) -> 
tensor { + %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<3> : tensor %1 = stablehlo.multiply %arg0, %0 : tensor %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> @@ -484,7 +496,7 @@ module @jit_fun_flat_jax { module=module, Tout=[res.dtype], Sout=[(None,)], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -495,7 +507,12 @@ module @jit_fun_flat_jax { def f(x): # x: f32[b, 4] module, version = serialize(""" module @jit_fun_flat_jax { - func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func public @main(%arg1: tensor) -> tensor { + %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor %1 = stablehlo.constant dense<0> : tensor<1xi64> %2 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> @@ -510,7 +527,7 @@ module @jit_fun_flat_jax { module=module, Tout=[res.dtype], Sout=[(None, 2)], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -521,7 +538,12 @@ module @jit_fun_flat_jax { def f(x): # x: f32[b, 4] module, version = serialize(""" module @jit_fun_flat_jax { - func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor<4xf32> { + func.func public @main(%arg1: tensor) -> tensor<4xf32> { + %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor<4xf32> + return %0 : tensor<4xf32> + } + func.func private 
@dyn_main(%arg0: tensor, %arg1: tensor) -> tensor<4xf32> { %0 = stablehlo.constant dense<-1> : tensor %1 = stablehlo.add %arg0, %0 : tensor %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> @@ -541,7 +563,7 @@ module @jit_fun_flat_jax { module=module, Tout=[x.dtype], Sout=[(4,)], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -553,7 +575,12 @@ module @jit_fun_flat_jax { def f(x, idx): # x: f32[b, 4] idx: i32 module, version = serialize(""" module @jit_fun_flat_jax { - func.func public @main(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { + func.func public @main(%arg1: tensor, %arg2: tensor) -> tensor { + %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1, %arg2) : (tensor, tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor, %arg2: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor %1 = stablehlo.compare LT, %arg2, %0, SIGNED : (tensor, tensor) -> tensor %2 = stablehlo.add %arg2, %arg0 : tensor @@ -568,7 +595,7 @@ module @jit_fun_flat_jax { module=module, Tout=[res.dtype], Sout=[(None, 4)], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x, idx), (res,)) @@ -581,7 +608,12 @@ module @jit_fun_flat_jax { # return (np.broadcast_to(x, y.shape), x + y) module, version = serialize(""" module @jit_fun.0 { - func.func public @main(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x?x4xf32>) -> (tensor<2x?x4xf32>, tensor<2x?x4xf32>) { + func.func public @main(%arg1: tensor, %arg2: tensor<2x?x4xf32>) -> (tensor<2x?x4xf32>, tensor<2x?x4xf32>) { + %arg0_new = "stablehlo.get_dimension_size"(%arg2) {dimension = 1 : i64} : (tensor<2x?x4xf32>) -> tensor + %0, %1 = call @dyn_main(%arg0_new, %arg1, %arg2) : (tensor, tensor, tensor<2x?x4xf32>) -> (tensor<2x?x4xf32>, tensor<2x?x4xf32>) + return %0, %1 
: tensor<2x?x4xf32>, tensor<2x?x4xf32> + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor, %arg2: tensor<2x?x4xf32>) -> (tensor<2x?x4xf32>, tensor<2x?x4xf32>) { %0 = stablehlo.constant dense<2> : tensor<1xi32> %2 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> %3 = stablehlo.constant dense<4> : tensor<1xi32> @@ -596,7 +628,7 @@ module @jit_fun.0 { module=module, Tout=[res[0].dtype, res[1].dtype], Sout=[(2, None, 4), (2, None, 4)], - dim_args_spec=['1.1']) + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x, y), res) @@ -608,14 +640,19 @@ module @jit_fun.0 { def f(x): # x: i32[b] module, version = serialize(""" module @jit_fun{ - func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func public @main(%arg1: tensor) -> tensor { + %arg0_new = "stablehlo.get_dimension_size"(%arg2) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0> : tensor %1 = stablehlo.reduce(%arg1 init: %0) across dimensions = [0] : (tensor, tensor) -> tensor reducer(%arg2: tensor, %arg3: tensor) { - %4 = mhlo.add %arg2, %arg3 : tensor - "mhlo.return"(%4) : (tensor) -> () + %4 = stablehlo.add %arg2, %arg3 : tensor + "stablehlo.return"(%4) : (tensor) -> () } - %2 = mhlo.multiply %1, %arg0 : tensor + %2 = stablehlo.multiply %1, %arg0 : tensor return %2 : tensor } } @@ -624,7 +661,7 @@ module @jit_fun{ module=module, Tout=[res.dtype], Sout=[res.shape], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -635,7 +672,12 @@ module @jit_fun{ def f(x): # x: f32[b, 5] module, version = serialize(""" module @jit_fun_flat_jax { - func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func public @main(%arg1: tensor) -> tensor { + %arg0_new = 
"stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = stablehlo.constant dense<0.000000e+00> : tensor %1 = stablehlo.reduce(%arg1 init: %0) across dimensions = [1] : (tensor, tensor) -> tensor reducer(%arg2: tensor, %arg3: tensor) { @@ -654,7 +696,7 @@ module @jit_fun_flat_jax { module=module, Tout=[res.dtype], Sout=[(None, 1)], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()],) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -666,7 +708,12 @@ module @jit_fun_flat_jax { def f(x): # x: f32[b] module, version = serialize(""" module @jit_fun_3 { - func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func public @main(%arg1: tensor) -> tensor { + %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { %0 = call @f(%arg0, %arg1) : (tensor, tensor) -> tensor return %0 : tensor } @@ -681,7 +728,7 @@ module @jit_fun_3 { module=module, Tout=[res.dtype], Sout=[()], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()]) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -692,7 +739,12 @@ module @jit_fun_3 { def f(x): # x: f32[b] module, version = serialize(""" module @jit_fun_3 { - func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + func.func public @main(%arg1: tensor) -> tensor { + %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> tensor { return %arg1 : tensor } } @@ -701,7 +753,7 @@ module @jit_fun_3 { 
module=module, Tout=[res.dtype], Sout=[()], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()]) self._assertOpOutputMatchesExpected(f, (x,), (res,)) @@ -717,7 +769,12 @@ module @jit_fun_3 { def f(x): # x: f32[b] module, version = serialize(""" module @jit_fun_flat_jax { - func.func public @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + func.func public @main(%arg1: tensor) -> (tensor, tensor) { + %arg0_new = "stablehlo.get_dimension_size"(%arg1) {dimension = 0 : i64} : (tensor) -> tensor + %0, %1 = call @dyn_main(%arg0_new, %arg1) : (tensor, tensor) -> (tensor, tensor) + return %0, %1 : tensor, tensor + } + func.func private @dyn_main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { %0 = stablehlo.constant dense<0> : tensor %1:2 = "stablehlo.while"(%arg1, %0) ({ ^bb0(%arg2: tensor, %arg3: tensor): @@ -741,10 +798,301 @@ module @jit_fun_flat_jax { module=module, Tout=[res0.dtype, res1.dtype], Sout=[(None,), res1.shape], - dim_args_spec=['0.0']) + platforms=[self.testing_platform()]) self._assertOpOutputMatchesExpected(f, (x,), (res0, res1)) + def test_tf_call_function(self): + """A TensorFlow function call inside StableHLO.""" + x = np.int32(2) + y = np.int32(3) + res = x + y + + @function.Defun(dtypes.int32, dtypes.int32) + def foo(x, y): + return x + y + + def f(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) { + tf.backend_config = {called_index = 0} + } : (tensor, tensor) -> tensor + return %0 : tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res.dtype], + Sout=[res.shape], + platforms=[self.testing_platform()], + function_list=(foo,), + ) + + self._assertOpOutputMatchesExpected(f, (x, y), (res,)) + + def test_tf_call_function_multiple_funcs(self): + """Multiple TensorFlow function calls inside StableHLO.""" + x = np.int32(2) + y = 
np.int32(3) + res = (x + y) + (x + y) + + @function.Defun(dtypes.int32, dtypes.int32) + def foo(x, y): + return x + y + + @function.Defun(dtypes.int32, dtypes.int32) + def bar(x, y): + return foo(x, y) + + def f(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) { + tf.backend_config = {called_index = 0} + } : (tensor, tensor) -> tensor + %1 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) { + tf.backend_config = {called_index = 1} + } : (tensor, tensor) -> tensor + %2 = stablehlo.custom_call @tf.call_tf_function(%0, %1) { + tf.backend_config = {called_index = 1} + } : (tensor, tensor) -> tensor + return %2 : tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res.dtype], + Sout=[res.shape], + platforms=[self.testing_platform()], + function_list=(foo, bar), + ) + + self._assertOpOutputMatchesExpected(f, (x, y), (res,)) + + def test_shape_polymorphic_tf_call_function(self): + """A TensorFlow function call inside StableHLO.""" + x = np.full((2,), 2, dtype=np.int32) + y = np.full((2,), 3, dtype=np.int32) + res = x + y + + @function.Defun(dtypes.int32, dtypes.int32) + def foo(x, y): + return x + y + + def f(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = stablehlo.get_dimension_size %arg0, dim = 0 : (tensor) -> tensor + %1 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1, %0) { + tf.backend_config = {called_index = 0}, + indices_of_shape_operands = dense<[2]> : tensor<1xi64> + } : (tensor, tensor, tensor) -> tensor + return %1 : tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res.dtype], + Sout=[res.shape], + platforms=[self.testing_platform()], + function_list=(foo,), + ) + + 
self._assertOpOutputMatchesExpected(f, (x, y), (res,)) + + def test_tf_call_function_with_token(self): + """A TensorFlow function call inside StableHLO.""" + x = np.int32(2) + y = np.int32(3) + res = x + y + + @function.Defun(dtypes.int32, dtypes.int32) + def foo(x, y): + return x + y + + def f(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + func.func public @main(%arg0: !stablehlo.token, %arg1: tensor, %arg2: tensor) -> (!stablehlo.token, tensor) { + %0:2 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1, %arg2) { + tf.backend_config = {called_index = 0, has_token_input_output = true} + } : (!stablehlo.token, tensor, tensor) -> (!stablehlo.token, tensor) + return %0#0, %0#1 : !stablehlo.token, tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res.dtype], + Sout=[res.shape], + platforms=[self.testing_platform()], + function_list=(foo,), + has_token_input_output=True, + ) + + self._assertOpOutputMatchesExpected(f, (x, y), (res,)) + + def test_tf_call_function_nested(self): + """Nested XlaCallModule inside TensorFlow function calls.""" + x = np.int32(2) + y = np.int32(3) + res = x + y + + @function.Defun(dtypes.int32, dtypes.int32) + def add(x, y): + return x + y + + @function.Defun(dtypes.int32, dtypes.int32) + def nested_xla_call(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) { + tf.backend_config = {called_index = 0} + } : (tensor, tensor) -> tensor + return %0 : tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res.dtype], + Sout=[res.shape], + platforms=[self.testing_platform()], + function_list=(add,), + ) + + @function.Defun(dtypes.int32, dtypes.int32) + def call(x, y): + return nested_xla_call(x, y) + + def f(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + 
func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) { + tf.backend_config = {called_index = 0} + } : (tensor, tensor) -> tensor + return %0 : tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res.dtype], + Sout=[res.shape], + platforms=[self.testing_platform()], + function_list=(call,), + ) + + self._assertOpOutputMatchesExpected(f, (x, y), (res,)) + + def test_tf_call_function_nested_func_renaming(self): + """Multiple custom calls with identically named private functions.""" + x = np.int32(2) + y = np.int32(3) + res0 = x + y + res1 = x - y + + # Verify that multiple inner TF function calls with the same private + # functions are properly renamed during MHLO import. This test case is + # carefully constructed such that one outer XlaCallModule op has two custom + # calls, each of which has the same private "@call" function with different + # body. This is to catch bugs in the func renaming logic. 
+ + @function.Defun(dtypes.int32, dtypes.int32) + def add(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + func.func private @call(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = stablehlo.add %arg0, %arg1 : tensor + return %0 : tensor + } + + func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = func.call @call(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res0.dtype], + Sout=[res0.shape], + platforms=[self.testing_platform()], + ) + + @function.Defun(dtypes.int32, dtypes.int32) + def subtract(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + func.func private @call(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = stablehlo.subtract %arg0, %arg1 : tensor + return %0 : tensor + } + + func.func public @main(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = func.call @call(%arg0, %arg1) : (tensor, tensor) -> tensor + return %0 : tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res1.dtype], + Sout=[res1.shape], + platforms=[self.testing_platform()], + ) + + def f(x, y): + module, version = serialize(""" +module @jit_fun_flat_jax { + func.func public @main(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %0 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) { + tf.backend_config = {called_index = 0} + } : (tensor, tensor) -> tensor + %1 = stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) { + tf.backend_config = {called_index = 1} + } : (tensor, tensor) -> tensor + return %0, %1 : tensor, tensor + } +} +""") + return xla.call_module( + [x, y], + version=version, + module=module, + Tout=[res0.dtype, res1.dtype], + Sout=[res0.shape, res1.shape], + platforms=[self.testing_platform()], + function_list=(add, subtract), + ) + + self._assertOpOutputMatchesExpected(f, (x, y), (res0, res1)) + def test_op_backward_compatibility(self): 
"""Test for ensuring XlaCallModuleOp backward compatiblity.""" x = np.array([1.0, 2.0, 3.0], dtype=np.float32) @@ -769,6 +1117,7 @@ module @jit_f.0 { module=module, Tout=[x.dtype], Sout=[x.shape], + platforms=[self.testing_platform()], ) self._assertOpOutputMatchesExpected(f, (x,), (np.sin(np.cos(x)),)) diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index bb48f9e806b..8f4c707e901 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -443,9 +443,9 @@ tf_custom_op_py_library( deps = [ ":_pywrap_py_utils", ":trt_ops", - "//tensorflow/python:errors", - "//tensorflow/python:framework_for_generated_wrappers", - "//tensorflow/python:resources", + "//tensorflow/python/framework:errors", + "//tensorflow/python/framework:for_generated_wrappers", + "//tensorflow/python/ops:resources", ], ) @@ -1030,6 +1030,7 @@ pybind_extension( "@local_config_rocm//:__subpackages__", "@local_config_tensorrt//:__subpackages__", "@local_execution_config_platform//:__subpackages__", + "@ml_dtypes//:__subpackages__", "@nsync//:__subpackages__", "@platforms//:__subpackages__", "@pybind11//:__subpackages__", diff --git a/tensorflow/compiler/tf2tensorrt/common/utils.cc b/tensorflow/compiler/tf2tensorrt/common/utils.cc index 92166c2e79e..26ac37b237b 100644 --- a/tensorflow/compiler/tf2tensorrt/common/utils.cc +++ b/tensorflow/compiler/tf2tensorrt/common/utils.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2tensorrt/common/utils.h" +#include + #if GOOGLE_CUDA && GOOGLE_TENSORRT #include "absl/base/call_once.h" #include "absl/strings/str_cat.h" diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index 0bf252386bc..676281bd6a4 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -210,9 +210,9 @@ filegroup( srcs = [ "xla_compiled_cpu_function.h", "//tensorflow/compiler/xla:cpu_runtime_hdrs", + "//tensorflow/compiler/xla/runtime:aot_ffi_execution_context_hdrs", "//tensorflow/compiler/xla/service:custom_call_status_hdrs", "//tensorflow/compiler/xla/service/cpu:runtime_hdrs", - "//tensorflow/compiler/xla/service/cpu:xla_runtime_runner_hdrs", "//tensorflow/core/kernels:xla_cpu_runtime_hdrs", "//tensorflow/core/platform:xla_cpu_runtime_srcs", "//tensorflow/tsl/framework:xla_cpu_runtime_hdrs", @@ -229,7 +229,6 @@ filegroup( "//tensorflow/compiler/xla:cpu_runtime_srcs", "//tensorflow/compiler/xla/service:custom_call_status_srcs", "//tensorflow/compiler/xla/service/cpu:runtime_srcs", - "//tensorflow/compiler/xla/service/cpu:xla_runtime_runner_srcs", "//tensorflow/core/kernels:xla_cpu_runtime_srcs", "//tensorflow/core/platform:xla_cpu_runtime_srcs", "//tensorflow/tsl/platform:xla_cpu_runtime_srcs", @@ -377,6 +376,7 @@ cc_library( # binary produced by tfcompile. 
"//tensorflow/compiler/xla:cpu_function_runtime", "//tensorflow/compiler/xla:executable_run_options", + "//tensorflow/compiler/xla/runtime:aot_ffi_execution_context", "//tensorflow/compiler/xla/service/cpu:buffer_desc", "//tensorflow/core/platform:types", ], @@ -513,6 +513,7 @@ cc_library( "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", "@com_google_absl//absl/types:optional", "@com_google_absl//absl/types:span", "@com_google_absl//absl/types:variant", diff --git a/tensorflow/compiler/tf2xla/kernels/BUILD b/tensorflow/compiler/tf2xla/kernels/BUILD index ac616e542a5..69adee9baab 100644 --- a/tensorflow/compiler/tf2xla/kernels/BUILD +++ b/tensorflow/compiler/tf2xla/kernels/BUILD @@ -11,6 +11,7 @@ load( "tf_cc_test", "tf_cuda_library", ) +load("//tensorflow/tsl/platform:build_config_root.bzl", "if_static") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -236,8 +237,10 @@ tf_kernel_library( "//tensorflow/core/kernels:stateless_random_ops_v2_header", "//tensorflow/core/tpu:tpu_defs", "//tensorflow/core/util:overflow", + "//tensorflow/tsl/platform:tensor_float_32_hdr_lib", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:optional", @@ -250,7 +253,7 @@ tf_kernel_library( ] + if_cuda_or_rocm( if_false = [], if_true = [":light_outside_compilation"], - ), + ) + if_static(["//tensorflow/tsl/platform:tensor_float_32_utils"]), ) tf_cuda_library( @@ -341,8 +344,9 @@ cc_library( "//tensorflow/core/framework:bounds_check", "//tensorflow/core/kernels:conv_grad_shape_utils", "//tensorflow/core/platform:statusor", + "//tensorflow/tsl/platform:tensor_float_32_hdr_lib", "@com_google_absl//absl/types:span", - ], + ] + 
if_static(["//tensorflow/tsl/platform:tensor_float_32_utils"]), ) cc_library( @@ -382,13 +386,19 @@ cc_library( srcs = ["xla_call_module_loader.cc"], hdrs = ["xla_call_module_loader.h"], deps = [ + "//tensorflow/compiler/jit:flags", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", + "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/xla/client:xla_computation", "//tensorflow/compiler/xla/mlir_hlo", - "//tensorflow/compiler/xla/pjrt:mlir_to_hlo", + "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", "//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_utils", + "//tensorflow/compiler/xla/translate/mhlo_to_hlo:mlir_hlo_to_hlo", "//tensorflow/tsl/platform:errors", "//tensorflow/tsl/platform:regexp", "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/status", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", @@ -409,27 +419,33 @@ tf_kernel_library( srcs = ["xla_call_module_op.cc"], deps = [ ":xla_call_module_loader", - "//tensorflow/compiler/tf2xla:common", + "//tensorflow/compiler/mlir/tensorflow:convert_type", + "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/tf2xla:side_effect_util", - "//tensorflow/compiler/tf2xla:tf2xla_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_helpers", "//tensorflow/compiler/tf2xla:xla_op_registry", "//tensorflow/compiler/tf2xla/ops:xla_ops", - "//tensorflow/compiler/xla:literal", "//tensorflow/compiler/xla:shape_util", - "//tensorflow/compiler/xla:sharding_op_util", - "//tensorflow/compiler/xla:status_macros", + "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla/client:xla_builder", "//tensorflow/compiler/xla/client:xla_computation", - "//tensorflow/compiler/xla/service:hlo_proto_cc", - "//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_utils", + "//tensorflow/compiler/xla/mlir_hlo", + "//tensorflow/compiler/xla/mlir_hlo:mhlo_passes", + 
"//tensorflow/compiler/xla/translate/hlo_to_mhlo:hlo_to_mlir_hlo", + "//tensorflow/compiler/xla/translate/mhlo_to_hlo:type_to_shape", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/platform:status", "//tensorflow/core/tpu:tpu_defs", + "//tensorflow/tsl/platform:errors", + "//tensorflow/tsl/platform:statusor", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", ], ) diff --git a/tensorflow/compiler/tf2xla/kernels/arg_op.cc b/tensorflow/compiler/tf2xla/kernels/arg_op.cc index 007074d8a9d..8f3081515e5 100644 --- a/tensorflow/compiler/tf2xla/kernels/arg_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/arg_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_compilation_device.h" diff --git a/tensorflow/compiler/tf2xla/kernels/assert_op.cc b/tensorflow/compiler/tf2xla/kernels/assert_op.cc index c40caa8fa10..c1c14d7dcaf 100644 --- a/tensorflow/compiler/tf2xla/kernels/assert_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/assert_op.cc @@ -26,7 +26,7 @@ namespace { class AssertOp : public XlaOpKernel { public: explicit AssertOp(OpKernelConstruction* ctx) : XlaOpKernel(ctx) {} - ~AssertOp() override {} + ~AssertOp() override = default; void Compile(XlaOpKernelContext* ctx) override { static mutex mu(tensorflow::LINKER_INITIALIZED); diff --git a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc index 095bedcda95..76a91179da6 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc +++ 
b/tensorflow/compiler/tf2xla/kernels/batch_matmul_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -20,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/math.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/tsl/platform/tensor_float_32_utils.h" namespace tensorflow { namespace { @@ -41,10 +44,13 @@ class BatchMatMulOp : public XlaOpKernel { } void Compile(XlaOpKernelContext* ctx) override { - auto result = - xla::BatchDot(MaybeConjugate(ctx->Input(0), adj_x_), adj_x_, - MaybeConjugate(ctx->Input(1), adj_y_), adj_y_, - xla::PrecisionConfig::DEFAULT, preferred_element_type_); + xla::PrecisionConfig::Precision precision = + tsl::tensor_float_32_execution_enabled() + ? xla::PrecisionConfig::DEFAULT + : xla::PrecisionConfig::HIGHEST; + auto result = xla::BatchDot(MaybeConjugate(ctx->Input(0), adj_x_), adj_x_, + MaybeConjugate(ctx->Input(1), adj_y_), adj_y_, + precision, preferred_element_type_); ctx->SetOutput(0, result); } diff --git a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc index e340342b1c9..18526b68538 100644 --- a/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batch_norm_op.cc @@ -17,6 +17,7 @@ limitations under the License. 
#include #include #include +#include #include "tensorflow/compiler/tf2xla/kernels/relu_op.h" #include "tensorflow/compiler/tf2xla/type_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc index 14e9f0b5590..5864da9885e 100644 --- a/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/batchtospace_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include + #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -21,7 +25,7 @@ limitations under the License. namespace tensorflow { namespace { -void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp& input, +void BatchToSpace(XlaOpKernelContext* ctx, const xla::XlaOp input, DataType input_dtype, const TensorShape& input_tensor_shape, absl::Span block_shape, const xla::Literal& crops) { diff --git a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc index a8e2755bfe9..60c3077649c 100644 --- a/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/bcast_ops.cc @@ -16,6 +16,8 @@ limitations under the License. // XLA-specific Ops for broadcasting used in gradient // code. +#include + #include "absl/strings/str_join.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc index a970c873695..0e2afe33de6 100644 --- a/tensorflow/compiler/tf2xla/kernels/binary_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/binary_ops.cc @@ -15,6 +15,9 @@ limitations under the License. 
// Native XLA implementations of simple binary Ops +#include +#include + #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" #include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc index eef8e940feb..7db022b280f 100644 --- a/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/broadcast_to_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" diff --git a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc index 5078f8662bd..cce0e332e68 100644 --- a/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/bucketize_op.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.cc b/tensorflow/compiler/tf2xla/kernels/case_op.cc index 438d454cb21..e1b4ef94208 100644 --- a/tensorflow/compiler/tf2xla/kernels/case_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/case_op.cc @@ -15,6 +15,10 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/kernels/case_op.h" +#include +#include +#include + #include "tensorflow/compiler/tf2xla/kernels/if_while_utils.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/side_effect_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/case_op.h b/tensorflow/compiler/tf2xla/kernels/case_op.h index cac026d81b6..1aa64228591 100644 --- a/tensorflow/compiler/tf2xla/kernels/case_op.h +++ b/tensorflow/compiler/tf2xla/kernels/case_op.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_CASE_OP_H_ #include +#include #include #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc index 534cfc58013..a89d3b5f2be 100644 --- a/tensorflow/compiler/tf2xla/kernels/categorical_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/categorical_op.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA implementations of Categorical op. +#include + #include "tensorflow/compiler/tf2xla/kernels/random_ops_util.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc index 20934423141..833efb34649 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc @@ -17,6 +17,11 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h" +#include +#include +#include +#include + #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" @@ -39,10 +44,24 @@ limitations under the License. 
#include "tensorflow/core/kernels/conv_grad_shape_utils.h" #include "tensorflow/core/util/padding.h" #include "tensorflow/core/util/tensor_format.h" +#include "tensorflow/tsl/platform/tensor_float_32_utils.h" namespace tensorflow { namespace { +xla::PrecisionConfig GetPrecisionConfig() { + xla::PrecisionConfig::Precision precision = + tsl::tensor_float_32_execution_enabled() ? xla::PrecisionConfig::DEFAULT + : xla::PrecisionConfig::HIGHEST; + xla::PrecisionConfig config; + const int num_inputs = 2; + config.mutable_operand_precision()->Reserve(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + config.add_operand_precision(precision); + } + return config; +} + // Returns the expanded size of a filter used for depthwise convolution. // If `shape` is [H, W, ..., M, N] returns [H, W, ..., 1, M*N]. xla::Shape GroupedFilterShapeForDepthwiseConvolution( @@ -187,9 +206,10 @@ StatusOr ConvOpAttrs::Create(int num_spatial_dims, bool depthwise, return attrs; } -StatusOr MakeXlaForwardConvOp( - StringPiece /*type_string*/, xla::XlaOp conv_input, xla::XlaOp filter, - const ConvOpAttrs& attrs, const xla::PrecisionConfig* precision_config) { +StatusOr MakeXlaForwardConvOp(StringPiece /*type_string*/, + xla::XlaOp conv_input, + xla::XlaOp filter, + const ConvOpAttrs& attrs) { TF_RETURN_IF_ERROR(CheckConvAttrs(attrs)); auto* builder = conv_input.builder(); @@ -277,6 +297,7 @@ StatusOr MakeXlaForwardConvOp( rhs_dilation[i], window_strides[i], attrs.padding, &unused_output_size, &padding[i].first, &padding[i].second)); } + xla::PrecisionConfig precision_config = GetPrecisionConfig(); if (padding_type != xla::PaddingType::PADDING_INVALID) { return xla::DynamicConvForward( @@ -284,20 +305,22 @@ StatusOr MakeXlaForwardConvOp( dims, /*feature_group_count=*/attrs.depthwise ? 
in_depth : feature_group_count, - /*batch_group_count=*/1, precision_config, padding_type); + /*batch_group_count=*/1, &precision_config, padding_type); } return xla::ConvGeneralDilated( conv_input, filter, window_strides, padding, lhs_dilation, rhs_dilation, dims, /*feature_group_count=*/attrs.depthwise ? in_depth : feature_group_count, - /*batch_group_count=*/1, precision_config); + /*batch_group_count=*/1, &precision_config); } -StatusOr MakeXlaBackpropInputConvOp( - StringPiece type_string, const xla::Shape& input_shape, xla::XlaOp filter, - xla::XlaOp out_backprop, const ConvOpAttrs& attrs, - const xla::PrecisionConfig* precision_config, xla::XlaOp* input_sizes) { +StatusOr MakeXlaBackpropInputConvOp(StringPiece type_string, + const xla::Shape& input_shape, + xla::XlaOp filter, + xla::XlaOp out_backprop, + const ConvOpAttrs& attrs, + xla::XlaOp* input_sizes) { TF_RETURN_IF_ERROR(CheckConvAttrs(attrs)); int num_dims = attrs.num_spatial_dims + 2; @@ -367,6 +390,7 @@ StatusOr MakeXlaBackpropInputConvOp( lhs_dilation[i] = dims.spatial_dims[i].stride; rhs_dilation[i] = attrs.dilations[dim]; } + xla::PrecisionConfig precision_config = GetPrecisionConfig(); if (feature_group_count != 1 && !attrs.depthwise) { filter = TransposeFilterForGroupConvolutionBackpropInput( @@ -381,7 +405,7 @@ StatusOr MakeXlaBackpropInputConvOp( lhs_dilation, rhs_dilation, dnums, /*feature_group_count=*/ feature_group_count, - /*batch_group_count=*/1, precision_config, padding_type); + /*batch_group_count=*/1, &precision_config, padding_type); } // activation gradients // = gradients (with padding and dilation) mirrored_weights @@ -389,13 +413,14 @@ StatusOr MakeXlaBackpropInputConvOp( padding, lhs_dilation, rhs_dilation, dnums, /*feature_group_count=*/ feature_group_count, - /*batch_group_count=*/1, precision_config); + /*batch_group_count=*/1, &precision_config); } -StatusOr MakeXlaBackpropFilterConvOp( - StringPiece type_string, xla::XlaOp activations, - const xla::Shape& filter_shape, 
xla::XlaOp gradients, - const ConvOpAttrs& attrs, const xla::PrecisionConfig* precision_config) { +StatusOr MakeXlaBackpropFilterConvOp(StringPiece type_string, + xla::XlaOp activations, + const xla::Shape& filter_shape, + xla::XlaOp gradients, + const ConvOpAttrs& attrs) { TF_RETURN_IF_ERROR(CheckConvAttrs(attrs)); auto* builder = activations.builder(); @@ -519,6 +544,7 @@ StatusOr MakeXlaBackpropFilterConvOp( : 0; padding[i] = {pad_before, pad_total - pad_before}; } + xla::PrecisionConfig precision_config = GetPrecisionConfig(); // Besides padding the input, we will also expand output_rows to // expanded_out_rows = (output_rows - 1) * stride + 1 @@ -533,14 +559,14 @@ StatusOr MakeXlaBackpropFilterConvOp( activations, gradients, window_strides, padding, /*lhs_dilation=*/ones, rhs_dilation, dnums, /*feature_group_count=*/1, - /*batch_group_count=*/batch_group_count, precision_config, + /*batch_group_count=*/batch_group_count, &precision_config, padding_type); } else { filter_backprop = xla::ConvGeneralDilated( activations, gradients, window_strides, padding, /*lhs_dilation=*/ones, rhs_dilation, dnums, /*feature_group_count=*/1, - /*batch_group_count=*/batch_group_count, precision_config); + /*batch_group_count=*/batch_group_count, &precision_config); } if (attrs.depthwise) { diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h index 7922c6ba821..70c579cde73 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h @@ -58,20 +58,19 @@ struct ConvOpAttrs { // Creates a new XLA forward or backward convolution with the given inputs and // attributes. 
-StatusOr MakeXlaForwardConvOp( - StringPiece type_string, xla::XlaOp conv_input, xla::XlaOp filter, - const ConvOpAttrs& attrs, - const xla::PrecisionConfig* precision_config = nullptr); +StatusOr MakeXlaForwardConvOp(StringPiece type_string, + xla::XlaOp conv_input, + xla::XlaOp filter, + const ConvOpAttrs& attrs); StatusOr MakeXlaBackpropInputConvOp( StringPiece type_string, const xla::Shape& input_shape, xla::XlaOp filter, xla::XlaOp out_backprop, const ConvOpAttrs& attrs, - const xla::PrecisionConfig* precision_config = nullptr, xla::XlaOp* input_sizes = nullptr); -StatusOr MakeXlaBackpropFilterConvOp( - StringPiece type_string, xla::XlaOp activations, - const xla::Shape& filter_shape, xla::XlaOp gradients, - const ConvOpAttrs& attrs, - const xla::PrecisionConfig* precision_config = nullptr); +StatusOr MakeXlaBackpropFilterConvOp(StringPiece type_string, + xla::XlaOp activations, + const xla::Shape& filter_shape, + xla::XlaOp gradients, + const ConvOpAttrs& attrs); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc index 1d94cf4969f..0f1b53c8a56 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_ops.cc @@ -113,7 +113,7 @@ class ConvBackpropInputOp : public XlaOpKernel { xla::XlaOp input_sizes = ctx->Input(0); StatusOr in_backprop = MakeXlaBackpropInputConvOp( ctx->op_kernel().type_string(), input_shape, ctx->Input(1), - ctx->Input(2), attrs_, nullptr, &input_sizes); + ctx->Input(2), attrs_, &input_sizes); OP_REQUIRES_OK(ctx, in_backprop.status()); ctx->SetOutput(0, in_backprop.value()); } diff --git a/tensorflow/compiler/tf2xla/kernels/cross_op.cc b/tensorflow/compiler/tf2xla/kernels/cross_op.cc index a1ab899a7c1..923f578900c 100644 --- a/tensorflow/compiler/tf2xla/kernels/cross_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/cross_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and 
limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc index 10570c91339..14678369741 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.cc @@ -17,6 +17,10 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" +#include +#include +#include + #include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h index 199d3514c22..748ce28777f 100644 --- a/tensorflow/compiler/tf2xla/kernels/cwise_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/cwise_ops.h @@ -18,6 +18,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_CWISE_OPS_H_ #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_CWISE_OPS_H_ +#include +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/client_library.h" #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -41,7 +44,7 @@ class XlaBinaryOp : public XlaOpKernel { OP_REQUIRES(ctx, lhs == rhs, errors::InvalidArgument("Input types of binary op must match")); } - ~XlaBinaryOp() override {} + ~XlaBinaryOp() override = default; // Implement the (tensor,tensor)->tensor lambda that should be // applied to the inputs. 
The desired computation should be added to diff --git a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc index e2b3e3ffcf5..5833480a664 100644 --- a/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/data_format_ops.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include #include #include diff --git a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc index 6ca29c5526f..a8bad158812 100644 --- a/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/depthtospace_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/lib/data_format.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/device_index_op.cc b/tensorflow/compiler/tf2xla/kernels/device_index_op.cc index ff058f92cd7..3c4bbe78bfe 100644 --- a/tensorflow/compiler/tf2xla/kernels/device_index_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/device_index_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/diag_op.cc b/tensorflow/compiler/tf2xla/kernels/diag_op.cc index 1ad75b65c66..e11844303da 100644 --- a/tensorflow/compiler/tf2xla/kernels/diag_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/diag_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/compiler/tf2xla/lib/util.h" #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/type_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc index d6de86a4ef8..635cad36675 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_partition_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #include #include "absl/algorithm/container.h" diff --git a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc index cd03b617158..5a36c175478 100644 --- a/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/dynamic_stitch_op.cc @@ -15,6 +15,9 @@ limitations under the License. // XLA-specific dynamic stitch Op. 
+#include +#include + #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/empty_op.cc b/tensorflow/compiler/tf2xla/kernels/empty_op.cc index 2b90a2c4d35..348cdf06ec6 100644 --- a/tensorflow/compiler/tf2xla/kernels/empty_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/empty_op.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA-specific Empty Op. +#include + #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc index 55bce65bd8e..49e80226786 100644 --- a/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/extract_image_patches_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/compiler/tf2xla/kernels/conv_op_helpers.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc index eb9de507fb0..d437e5476b3 100644 --- a/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fake_quantize_ops.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include + #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -47,7 +50,7 @@ void CpuNudge(const float min, const float max, const float quant_min, // An XLA version of CpuNudge(). void XlaNudge(xla::XlaBuilder* b, const DataType data_type, - const xla::XlaOp& min, const xla::XlaOp& max, + const xla::XlaOp min, const xla::XlaOp max, const float quant_min_value, const float quant_max_value, xla::XlaOp* nudged_min, xla::XlaOp* nudged_max, xla::XlaOp* scale) { @@ -67,11 +70,10 @@ void XlaNudge(xla::XlaBuilder* b, const DataType data_type, *nudged_max = xla::Mul(xla::Sub(quant_max, nudged_zero_point), *scale); } -xla::XlaOp Quantize(xla::XlaBuilder* b, const xla::XlaOp& input, - const DataType data_type, - const xla::XlaOp& nudged_input_min, - const xla::XlaOp& nudged_input_max, - const xla::XlaOp& input_scale) { +xla::XlaOp Quantize(xla::XlaBuilder* b, const xla::XlaOp input, + const DataType data_type, const xla::XlaOp nudged_input_min, + const xla::XlaOp nudged_input_max, + const xla::XlaOp input_scale) { xla::XlaOp one = XlaHelpers::FloatLiteral(b, data_type, 1.0f); xla::XlaOp inv_scale = xla::Div(one, input_scale); xla::XlaOp half = XlaHelpers::FloatLiteral(b, data_type, 0.5f); diff --git a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc index 1368d15a030..48726350c98 100644 --- a/tensorflow/compiler/tf2xla/kernels/fft_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fft_ops.cc @@ -15,6 +15,9 @@ limitations under the License. // XLA-specific Ops for FFT. 
+#include +#include + #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/fill_op.cc b/tensorflow/compiler/tf2xla/kernels/fill_op.cc index ebcbadb894e..3c5f41161ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/fill_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/fill_op.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA-specific Fill Op. +#include + #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/function_ops.cc b/tensorflow/compiler/tf2xla/kernels/function_ops.cc index 516e3aeaa88..3da7ce96bee 100644 --- a/tensorflow/compiler/tf2xla/kernels/function_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/function_ops.cc @@ -55,7 +55,7 @@ class AlwaysFailOp : public OpKernel { public: explicit AlwaysFailOp(OpKernelConstruction* ctx) : OpKernel(ctx) {} - ~AlwaysFailOp() override {} + ~AlwaysFailOp() override = default; void Compute(OpKernelContext* ctx) override { ctx->CtxFailure(errors::FailedPrecondition( diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index a28a0e9eb26..807e4304e8d 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -14,6 +14,8 @@ limitations under the License. 
==============================================================================*/ #include +#include +#include #include "absl/types/optional.h" #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h index 83ab17686e9..54d186dd12d 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h +++ b/tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h @@ -43,7 +43,7 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, // the input instead of context->input(0) in order to allow ResourceGather to // handle obtaining the data from the ResourceVariable. Status XlaGatherWithBatchDimsOpImpl(XlaOpKernelContext* context, - const xla::XlaOp input, + xla::XlaOp input, const TensorShape& input_shape, int batch_dims, xla::XlaOp* gather_output); } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc b/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc index 3162d197480..a7e47c3850a 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_scatter_ops.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index 7dd618aaf91..4a55c479ac0 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/kernels/if_op.h" +#include + #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/kernels/if_while_utils.h" #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.h b/tensorflow/compiler/tf2xla/kernels/if_op.h index 42f4e9d9e6b..11b196f939e 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.h +++ b/tensorflow/compiler/tf2xla/kernels/if_op.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_ #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_OP_H_ +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/core/framework/attr_value.pb.h" diff --git a/tensorflow/compiler/tf2xla/kernels/if_while_utils.cc b/tensorflow/compiler/tf2xla/kernels/if_while_utils.cc index 15314b0434e..7d3a7c7d176 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_while_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_while_utils.cc @@ -15,6 +15,11 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/if_while_utils.h" +#include +#include +#include +#include + #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/xla/literal.h" diff --git a/tensorflow/compiler/tf2xla/kernels/if_while_utils.h b/tensorflow/compiler/tf2xla/kernels/if_while_utils.h index 631fedd25f7..15f30975076 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_while_utils.h +++ b/tensorflow/compiler/tf2xla/kernels/if_while_utils.h @@ -16,6 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_WHILE_UTILS_H_ #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_IF_WHILE_UTILS_H_ +#include +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/core/lib/core/status.h" diff --git a/tensorflow/compiler/tf2xla/kernels/image_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_ops.cc index 4abfb149792..8e8b7134413 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_ops.cc @@ -13,7 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #include +#include #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc index 48c63d56e4c..6d034b8c6c7 100644 --- a/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/image_resize_ops.cc @@ -14,7 +14,10 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/tf2xla/kernels/image_resize_ops.h" +#include +#include #include +#include #include "absl/strings/str_format.h" #include "absl/types/span.h" @@ -213,7 +216,7 @@ xla::XlaOp MakeGeneralResizeKernelInDim(xla::XlaBuilder* builder, } xla::XlaOp BroadcastSpatialDimensions(xla::XlaBuilder* builder, - const xla::XlaOp& input, + const xla::XlaOp input, int32_t spatial_dimensions_offset, absl::Span in_size, absl::Span out_size) { @@ -235,7 +238,7 @@ xla::XlaOp BroadcastSpatialDimensions(xla::XlaBuilder* builder, } xla::XlaOp ResizeUsingDilationAndConvolution( - xla::XlaBuilder* builder, const xla::XlaOp& input, xla::PrimitiveType type, + xla::XlaBuilder* builder, const xla::XlaOp input, xla::PrimitiveType type, const int num_spatial_dims, absl::Span in_size, absl::Span out_size, const int64_t channels, const bool align_corners, bool is_kernel_bilinear) { @@ -381,7 +384,7 @@ xla::XlaOp ResizeUsingDilationAndConvolution( } xla::XlaOp ResizeUsingDilationAndConvolutionGradOp( - xla::XlaBuilder* builder, const xla::XlaOp& grad, xla::PrimitiveType type, + xla::XlaBuilder* builder, const xla::XlaOp grad, xla::PrimitiveType type, const int num_spatial_dims, absl::Span in_size, absl::Span grad_size, const int64_t channels, const bool align_corners, bool is_kernel_bilinear) { diff --git a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc index 8e81356ea85..bb28f1ea0aa 100644 --- a/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/l2loss_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include + #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" diff --git a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc index aaf6a8f89eb..fa8a5ddf8f1 100644 --- a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc +++ b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc @@ -281,7 +281,7 @@ int GetOutputBufferId(int output_num, const TfCallbackData& callback_data) { int64_t BufferSize(const TfCallbackData::BufferDescription& descr) { TensorShape shape; - CHECK(TensorShape::BuildTensorShape(descr.shape(), &shape).ok()); // Crash OK + TF_CHECK_OK(TensorShape::BuildTensorShape(descr.shape(), &shape)); // Crash OK return shape.num_elements() * DataTypeSize(descr.type()); } diff --git a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h index e4786f0142e..24675783495 100644 --- a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h +++ b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_LIGHT_OUTSIDE_COMPILATION_H_ #include +#include #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/core/platform/status.h" diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc index e741d6dfcff..ec95ceccfe6 100644 --- a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc @@ -16,8 +16,11 @@ limitations under the License. // XLA-specific ListDiff Op. This only supports constant DT_INT32 and DT_INT64 // input. 
+#include #include +#include +#include "absl/container/flat_hash_set.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" @@ -74,7 +77,7 @@ class ListDiffOp : public XlaOpKernel { TF_RETURN_IF_ERROR(context->ConstantInputAsIntVector(0, &x_input)); TF_RETURN_IF_ERROR(context->ConstantInputAsIntVector(1, &y_input)); - std::unordered_set y_input_set; + absl::flat_hash_set y_input_set; y_input_set.reserve(y_input.size()); for (auto y : y_input) { y_input_set.insert(y); diff --git a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc index 785b7bea107..86c0d97e97f 100644 --- a/tensorflow/compiler/tf2xla/kernels/matmul_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matmul_op.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA-specific MatMul Op. +#include + #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -22,6 +24,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/tsl/platform/tensor_float_32_utils.h" namespace tensorflow { namespace { @@ -88,7 +91,12 @@ class MatMulOp : public XlaOpKernel { b = xla::ConvertElementType(b, xla::F32); } } - ctx->SetOutput(0, xla::BatchDot(a, transpose_a_, b, transpose_b_)); + xla::PrecisionConfig::Precision precision = + tsl::tensor_float_32_execution_enabled() + ? 
xla::PrecisionConfig::DEFAULT + : xla::PrecisionConfig::HIGHEST; + ctx->SetOutput(0, + xla::BatchDot(a, transpose_a_, b, transpose_b_, precision)); } private: diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc index 2fcbf22a5f0..e9cb7b60db9 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_diag_ops.cc @@ -13,6 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include +#include + #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc index b221f55655e..7edb6fbf3b3 100644 --- a/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/matrix_triangular_solve_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index 64351c6a741..83e8697d8c0 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -15,7 +15,9 @@ limitations under the License. // XLA specific pooling ops. 
+#include #include +#include #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc index ad1554312f2..f36b07cc93c 100644 --- a/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/quantize_and_dequantize_op.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/random_ops.cc b/tensorflow/compiler/tf2xla/kernels/random_ops.cc index 176c83a8375..2a66980314d 100644 --- a/tensorflow/compiler/tf2xla/kernels/random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/random_ops.cc @@ -17,6 +17,8 @@ limitations under the License. // TODO(misard,phawkins): handle random number generator seeds/states correctly. // TODO(misard,phawkins): add tests. +#include + #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" #include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/lib/random.h" diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc index 662052dac29..fef848224c1 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc @@ -16,6 +16,9 @@ limitations under the License. // XLA-specific reduction Ops. 
#include "tensorflow/compiler/tf2xla/kernels/reduction_ops.h" + +#include + #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h index 8141dde7f2c..42631ae4b5b 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.h +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.h @@ -18,6 +18,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_ #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_REDUCTION_OPS_H_ +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/op_kernel.h" @@ -34,7 +36,7 @@ namespace tensorflow { class XlaReductionOp : public XlaOpKernel { public: XlaReductionOp(OpKernelConstruction* ctx, DataType reduction_type); - ~XlaReductionOp() override {} + ~XlaReductionOp() override = default; // Return the base case for the reduction. virtual xla::XlaOp InitialValue(xla::XlaBuilder* builder) = 0; diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc index 95a2a454210..1194a2e0c70 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops_common.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA-specific reduction Ops. 
+#include + #include "absl/strings/str_join.h" #include "tensorflow/compiler/tf2xla/kernels/reduction_ops.h" #include "tensorflow/compiler/tf2xla/type_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc index e6b21219894..fec4b5fea61 100644 --- a/tensorflow/compiler/tf2xla/kernels/reshape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reshape_op.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA-specific reshape Op. +#include + #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc index 72932ea72ec..0df51930ffc 100644 --- a/tensorflow/compiler/tf2xla/kernels/reverse_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/reverse_op.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA-specific reverse Op. +#include + #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/roll_op.cc b/tensorflow/compiler/tf2xla/kernels/roll_op.cc index ae0827391d8..b9b9939d0ee 100644 --- a/tensorflow/compiler/tf2xla/kernels/roll_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/roll_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/slicing.h" diff --git a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc index 7be090adb4a..12bdc30a950 100644 --- a/tensorflow/compiler/tf2xla/kernels/scan_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/scan_ops.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #include #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc index 1812ddab2b6..0e41300351f 100644 --- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/lib/scatter.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc index 46b0aab40dc..9b33307ac00 100644 --- a/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/segment_reduction_ops.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/lib/scatter.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/select_op.cc b/tensorflow/compiler/tf2xla/kernels/select_op.cc index 446d7f3d7aa..c1ca1fc67d9 100644 --- a/tensorflow/compiler/tf2xla/kernels/select_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/select_op.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include +#include #include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/shape_op.cc b/tensorflow/compiler/tf2xla/kernels/shape_op.cc index 60b1f5eea3a..39ca2030045 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_op.cc @@ -15,6 +15,11 @@ limitations under the License. // XLA-specific Shape Ops. +#include +#include +#include + +#include "absl/container/flat_hash_set.h" #include "absl/strings/str_format.h" #include "tensorflow/compiler/tf2xla/kernels/shape_util.h" #include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h" @@ -312,7 +317,7 @@ class SqueezeOp : public XlaOpKernel { xla::Shape shape = input_shape.value(); int64_t rank = shape.rank(); - std::unordered_set wrapped_squeeze_dims; + absl::flat_hash_set wrapped_squeeze_dims; wrapped_squeeze_dims.reserve(squeeze_dims_.size()); std::vector new_shape; // Validate squeeze dims against the input. 
@@ -360,7 +365,7 @@ class SqueezeOp : public XlaOpKernel { } private: - std::unordered_set squeeze_dims_; + absl::flat_hash_set squeeze_dims_; }; REGISTER_XLA_OP(Name("Squeeze"), SqueezeOp); diff --git a/tensorflow/compiler/tf2xla/kernels/sharding_op.cc b/tensorflow/compiler/tf2xla/kernels/sharding_op.cc index fdfef3a5355..14cdb4ab1e3 100644 --- a/tensorflow/compiler/tf2xla/kernels/sharding_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/sharding_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/slice_op.cc b/tensorflow/compiler/tf2xla/kernels/slice_op.cc index 2c78ad74c16..0f930046c31 100644 --- a/tensorflow/compiler/tf2xla/kernels/slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/slice_op.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA-specific Slice Op. +#include + #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc index 520c73ffbbf..44c332c5eb5 100644 --- a/tensorflow/compiler/tf2xla/kernels/softmax_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/softmax_op.cc @@ -15,6 +15,9 @@ limitations under the License. // XLA-specific Ops for softmax. 
+#include +#include + #include "absl/strings/match.h" #include "tensorflow/compiler/tf2xla/lib/broadcast.h" #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc index d6e38f1309f..39ce1057139 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetobatch_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include + #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" @@ -22,7 +26,7 @@ limitations under the License. namespace tensorflow { namespace { -void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp& input, +void SpaceToBatch(XlaOpKernelContext* ctx, const xla::XlaOp input, DataType input_dtype, const TensorShape& input_tensor_shape, absl::Span block_shape, const xla::Literal& paddings) { diff --git a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc index 378aef0205d..be3d8e01b35 100644 --- a/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/spacetodepth_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/lib/data_format.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc b/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc index 4f3c7b79861..5d6ccd54a2d 100644 --- a/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/sparse_to_dense_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/lib/scatter.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" diff --git a/tensorflow/compiler/tf2xla/kernels/split_op.cc b/tensorflow/compiler/tf2xla/kernels/split_op.cc index ae5150f14f9..4871a89b0af 100644 --- a/tensorflow/compiler/tf2xla/kernels/split_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/split_op.cc @@ -15,6 +15,8 @@ limitations under the License. // XLA-specific Ops for split. +#include + #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc index ad33157ed78..a64c0d54a7b 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops.cc @@ -14,6 +14,7 @@ limitations under the License. 
==============================================================================*/ #include +#include #include "tensorflow/compiler/tf2xla/kernels/random_ops_util.h" #include "tensorflow/compiler/tf2xla/lib/broadcast.h" diff --git a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc index b5445aa3e90..76cd46e1893 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateless_random_ops_v2.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/core/kernels/stateless_random_ops_v2.h" #include +#include +#include #include "tensorflow/compiler/tf2xla/kernels/random_ops_util.h" #include "tensorflow/compiler/tf2xla/kernels/rng_converter_utils.h" diff --git a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc index a057b84c28b..d6e7f404fb9 100644 --- a/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/strided_slice_op.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/core/util/strided_slice_op.h" +#include #include #include "absl/algorithm/container.h" diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index a0f8f62cd57..fa70efbd906 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -118,8 +118,8 @@ Status GetTensorArrayShape(const XlaResource* resource, // Like XlaBuilder::DynamicUpdateSlice, but adds 'update' to the // relevant slice of 'operand'. 
-xla::XlaOp DynamicAddSlice(xla::XlaBuilder* builder, const xla::XlaOp& operand, - const xla::XlaOp& update, +xla::XlaOp DynamicAddSlice(xla::XlaBuilder* builder, const xla::XlaOp operand, + const xla::XlaOp update, absl::Span update_dims, absl::Span start_indices, DataType dtype) { diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc index f7e0f350a2f..544fc1d14ba 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h" +#include + #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h index f31cfd9eafc..d422bb63afd 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_ #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_TENSOR_LIST_UTILS_H_ +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/core/framework/tensor_shape.h" diff --git a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc index d873396a828..228d1c5bbe3 100644 --- a/tensorflow/compiler/tf2xla/kernels/transpose_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/transpose_op.cc @@ -18,6 +18,8 @@ limitations under the License. // handles all transposes, while Eigen needs a restricted DoTranspose // helper. 
+#include + #include "tensorflow/compiler/tf2xla/lib/scatter.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc b/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc index 3c992ee8407..8da1b6eb6eb 100644 --- a/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc +++ b/tensorflow/compiler/tf2xla/kernels/unary_ops_composition.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include + #include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" #include "tensorflow/compiler/tf2xla/kernels/cwise_ops.h" diff --git a/tensorflow/compiler/tf2xla/kernels/unique_op.cc b/tensorflow/compiler/tf2xla/kernels/unique_op.cc index c4389baf4a8..39fd56cd5ce 100644 --- a/tensorflow/compiler/tf2xla/kernels/unique_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/unique_op.cc @@ -15,6 +15,8 @@ limitations under the License. #include +#include +#include #include #include diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index 6fe7a44f32a..11b67ba9e54 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include +#include + #include "tensorflow/compiler/tf2xla/kernels/gather_op_helpers.h" #include "tensorflow/compiler/tf2xla/kernels/shape_util.h" #include "tensorflow/compiler/tf2xla/lib/scatter.h" @@ -208,7 +211,7 @@ class ResourceScatterAddOp : public ResourceScatterOp { : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + static xla::XlaOp Combine(const xla::XlaOp x, const xla::XlaOp y, xla::XlaBuilder* builder) { return xla::Add(x, y); } @@ -221,7 +224,7 @@ class ResourceScatterSubOp : public ResourceScatterOp { : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + static xla::XlaOp Combine(const xla::XlaOp x, const xla::XlaOp y, xla::XlaBuilder* builder) { return xla::Sub(x, y); } @@ -234,7 +237,7 @@ class ResourceScatterMulOp : public ResourceScatterOp { : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + static xla::XlaOp Combine(const xla::XlaOp x, const xla::XlaOp y, xla::XlaBuilder* builder) { return xla::Mul(x, y); } @@ -247,7 +250,7 @@ class ResourceScatterDivOp : public ResourceScatterOp { : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + static xla::XlaOp Combine(const xla::XlaOp x, const xla::XlaOp y, xla::XlaBuilder* builder) { return xla::Div(x, y); } @@ -260,7 +263,7 @@ class ResourceScatterMinOp : public ResourceScatterOp { : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + static xla::XlaOp Combine(const xla::XlaOp x, const xla::XlaOp y, xla::XlaBuilder* builder) { 
return xla::Min(x, y); } @@ -273,7 +276,7 @@ class ResourceScatterMaxOp : public ResourceScatterOp { : ResourceScatterOp(context, /*indices_are_vectors=*/false, Combine) {} private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + static xla::XlaOp Combine(const xla::XlaOp x, const xla::XlaOp y, xla::XlaBuilder* builder) { return xla::Max(x, y); } @@ -303,7 +306,7 @@ class ResourceScatterNdAddOp : public ResourceScatterOp { /*combiner=*/Combine) {} private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + static xla::XlaOp Combine(const xla::XlaOp x, const xla::XlaOp y, xla::XlaBuilder* builder) { return xla::Add(x, y); } @@ -317,7 +320,7 @@ class ResourceScatterNdSubOp : public ResourceScatterOp { /*combiner=*/Combine) {} private: - static xla::XlaOp Combine(const xla::XlaOp& x, const xla::XlaOp& y, + static xla::XlaOp Combine(const xla::XlaOp x, const xla::XlaOp y, xla::XlaBuilder* builder) { return xla::Sub(x, y); } diff --git a/tensorflow/compiler/tf2xla/kernels/where_op.cc b/tensorflow/compiler/tf2xla/kernels/where_op.cc index 107cf72eb5a..44ef36c063b 100644 --- a/tensorflow/compiler/tf2xla/kernels/where_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/where_op.cc @@ -13,6 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include +#include + #include "tensorflow/compiler/tf2xla/literal_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index 53b02fb5416..53a5e3c0525 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -15,6 +15,10 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/kernels/while_op.h" +#include +#include +#include + #include "absl/strings/str_split.h" #include "tensorflow/compiler/tf2xla/const_analysis.h" #include "tensorflow/compiler/tf2xla/kernels/if_while_utils.h" @@ -246,7 +250,7 @@ StatusOr BuildWrappedBody( xla::XlaOp BuildWhile(XlaOpKernelContext* ctx, const xla::XlaComputation& wrapped_cond, const xla::XlaComputation& wrapped_body, - const xla::XlaOp& initial_values, + const xla::XlaOp initial_values, const std::vector& input_mapping, const std::vector& compile_time_const_arg_indices, int num_compile_time_const_args, @@ -347,12 +351,12 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { body_options.return_updated_values_for_all_resources = true; body_options.is_entry_computation = false; body_options.add_token_input_output = has_token_input_output_; - XlaCompiler::CompilationResult body; + auto body = std::make_unique(); OP_REQUIRES_OK(ctx, compiler->CompileFunction(body_options, body_name_attr_, - arguments, &body)); + arguments, body.get())); OP_REQUIRES_OK( ctx, ctx->xla_context()->RecordCollectiveInfoFromNestedCompilationResult( - body)); + *body.get())); // We must use a static shape for parameters to an XLA compilation. However, // we may not know the shape of a resource if it is first @@ -378,8 +382,8 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { << has_uninitialized_tensor_lists; // Initializes any uninitialized resource with zero values of the // shape determined by the first compilation. 
- for (int i = 0; i < body.resource_updates.size(); ++i) { - const XlaCompiler::ResourceUpdate& update = body.resource_updates[i]; + for (int i = 0; i < body->resource_updates.size(); ++i) { + const XlaCompiler::ResourceUpdate& update = body->resource_updates[i]; XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(update.input_index, &resource)); @@ -416,7 +420,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Set the shape of any uninitialized TensorLists to the shape determined by // the first compilation. Note that, unlike resources, we do not initialize // the input list with zeros here, that is done later. - xla::Shape body_output_shape = body.xla_output_shape; + xla::Shape body_output_shape = body->xla_output_shape; OP_REQUIRES(ctx, body_output_shape.IsTuple(), errors::FailedPrecondition( "xla_output_shape of while body must be a tuple.")); @@ -431,9 +435,9 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Recompile the body with the "correct" resource shapes. 
VLOG(1) << "Recompiling body with corrected resource shapes"; - body = {}; + *body = {}; OP_REQUIRES_OK(ctx, compiler->CompileFunction(body_options, body_name_attr_, - arguments, &body)); + arguments, body.get())); } VLOG(1) << "Compiling condition"; @@ -446,9 +450,9 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { OP_REQUIRES_OK(ctx, compiler->CompileFunction(cond_options, cond_name_attr_, arguments, &cond)); - OP_REQUIRES(ctx, body.xla_input_shapes.size() == 1, + OP_REQUIRES(ctx, body->xla_input_shapes.size() == 1, errors::FailedPrecondition("Expected one input shape")); - xla::Shape body_input_shape = body.xla_input_shapes[0]; + xla::Shape body_input_shape = body->xla_input_shapes[0]; OP_REQUIRES(ctx, body_input_shape.IsTuple(), errors::FailedPrecondition("Expected tuple shape")); OP_REQUIRES(ctx, cond.xla_input_shapes.size() == 1, @@ -458,7 +462,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { errors::FailedPrecondition("Expected tuple shape")); VLOG(2) << "Body shape: " << xla::ShapeUtil::HumanString(body_input_shape) - << " -> " << xla::ShapeUtil::HumanString(body.xla_output_shape); + << " -> " << xla::ShapeUtil::HumanString(body->xla_output_shape); VLOG(2) << "Cond shape: " << xla::ShapeUtil::HumanString(cond_input_shape) << " -> " << xla::ShapeUtil::HumanString(cond.xla_output_shape); @@ -473,7 +477,7 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // args (which are pruned from the body outputs in body_wapper) matches the // shape of the inputs. 
OP_REQUIRES_OK(ctx, VerifyBodyInputAndOutputShapeMatch( - ctx, compile_time_const_arg_indices, body, + ctx, compile_time_const_arg_indices, *body.get(), has_token_input_output_)); xla::Shape expected_cond_output_shape_without_side_effect = @@ -494,10 +498,10 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { "(pred[], token[]), got: ", xla::ShapeUtil::HumanString(cond.xla_output_shape))); - int num_inputs = body.input_mapping.size(); + int num_inputs = body->input_mapping.size(); std::vector inputs(num_inputs); for (int i = 0; i < num_inputs; ++i) { - int input_num = body.input_mapping[i]; + int input_num = body->input_mapping[i]; if (has_token_input_output_ && i == num_inputs - 1) { // Set token input for this "while" op. std::vector token_inputs; @@ -577,14 +581,14 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { // Remove compile time const args from the list of body outputs. StatusOr body_result = - BuildWrappedBody(ctx, body, compile_time_const_arg_indices, + BuildWrappedBody(ctx, *body.get(), compile_time_const_arg_indices, num_compile_time_const_args, has_token_input_output_); OP_REQUIRES_OK(ctx, body_result.status()); xla::XlaComputation wrapped_body = std::move(body_result.value()); // Builds the While op and pads its output with the compile time const args. xla::XlaOp while_result = - BuildWhile(ctx, wrapped_cond, wrapped_body, init, body.input_mapping, + BuildWhile(ctx, wrapped_cond, wrapped_body, init, body->input_mapping, compile_time_const_arg_indices, num_compile_time_const_args, has_token_input_output_); @@ -617,8 +621,8 @@ void XlaWhileOp::Compile(XlaOpKernelContext* ctx) { } // Updates the values of any resource variables modified by the loop. 
- for (int i = 0; i < body.resource_updates.size(); ++i) { - const XlaCompiler::ResourceUpdate& update = body.resource_updates[i]; + for (int i = 0; i < body->resource_updates.size(); ++i) { + const XlaCompiler::ResourceUpdate& update = body->resource_updates[i]; XlaResource* resource; OP_REQUIRES_OK(ctx, ctx->GetResourceInput(update.input_index, &resource)); if (update.modified) { diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.h b/tensorflow/compiler/tf2xla/kernels/while_op.h index 0e259b3bac0..2f0b6c3a7f4 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.h +++ b/tensorflow/compiler/tf2xla/kernels/while_op.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_KERNELS_WHILE_OP_H_ #define TENSORFLOW_COMPILER_TF2XLA_KERNELS_WHILE_OP_H_ +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/core/framework/attr_value.pb.h" diff --git a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc index 44495e77e33..2beabc31f34 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_broadcast_helper_op.cc @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include + #include "absl/algorithm/container.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc index c8a82fbfa28..e265184ad2d 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc @@ -15,11 +15,15 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h" +#include #include #include #include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Block.h" // from @llvm-project @@ -44,10 +48,14 @@ limitations under the License. #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "stablehlo/dialect/VhloOps.h" // from @stablehlo #include "stablehlo/transforms/Passes.h" // from @stablehlo +#include "tensorflow/compiler/jit/flags.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" -#include "tensorflow/compiler/xla/pjrt/mlir_to_hlo.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/transforms/passes.h" #include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_utils.h" +#include "tensorflow/compiler/xla/translate/mhlo_to_hlo/mlir_hlo_to_hlo.h" #include "tensorflow/tsl/platform/errors.h" #include "tensorflow/tsl/platform/regexp.h" #include "tensorflow/tsl/platform/statusor.h" @@ -60,20 +68,37 @@ namespace { // version in the constructor in xla.py. // Version 1 used MHLO & CHLO, not supported anymore. // Version 2 supports StableHLO & CHLO. From 10/2022. -const int VERSION_START_STABLE_HLO = 2; +constexpr int VERSION_START_STABLE_HLO = 2; // Version 3 supports platform checking and multiple platforms. From 02/2023. -const int VERSION_START_PLATFORMS = 3; +constexpr int VERSION_START_PLATFORMS = 3; // Version 4 supports StableHLO with compatibility guarantees. -// Used from 03/2023. -const int VERSION_START_STABLE_HLO_COMPATIBILITY = 4; -// Version 5 add support to stablehlo.custom_call for host call tf graph. -// Used from 04/2023. 
-const int VERSION_SUPPORT_CUSTOM_CALL = 5; -const int VERSION_MINIMUM_SUPPORTED = VERSION_START_STABLE_HLO; -const int VERSION_MAXIMUM_SUPPORTED = VERSION_SUPPORT_CUSTOM_CALL; +// Used in jax2tf from March 15, 2023 (cl/516885716). Starting with +// March 28th, 2023 we stopped using dim_args_spec (cl/520033493). +// TODO(b/283439649): Remove support for dim_args_spec. +constexpr int VERSION_START_STABLE_HLO_COMPATIBILITY = 4; +// Version 5 adds support for call_tf_graph. This does not change the semantics +// of the op, but it allows the `function_list` attribute. +// Used in jax2tf from May 3rd, 2023 (cl/529106145). +constexpr int VERSION_START_SUPPORT_CALL_TF_GRAPH = 5; +// Version 6 adds support for the `disabled_checks` attribute. This version +// mandates a non-empty `platforms` attribute. +// Used in jax2tf since June 2023. +constexpr int VERSION_START_SUPPORT_DISABLED_CHECKS = 6; +constexpr int VERSION_MINIMUM_SUPPORTED = + VERSION_START_STABLE_HLO_COMPATIBILITY; + +constexpr int VERSION_MAXIMUM_SUPPORTED = VERSION_START_SUPPORT_DISABLED_CHECKS; + +constexpr absl::string_view DISABLED_CHECK_PLATFORM = "platform"; + +bool IsPlatformCheckDisabled(absl::Span disabled_checks) { + return std::find(disabled_checks.begin(), disabled_checks.end(), + DISABLED_CHECK_PLATFORM) != disabled_checks.end(); +} // Computes a dimension value from the dim_arg specification. // The specification is of the form ".". +// TODO(b/283439649): Remove support for dim_args_spec. 
tsl::StatusOr ComputeDimensionValue( int version, std::string dim_arg_spec, std::vector arguments, mlir::OpBuilder op_builder, mlir::Type dim_arg_type) { @@ -81,27 +106,27 @@ tsl::StatusOr ComputeDimensionValue( int arg_idx, arg_axis_idx; if (!RE2::FullMatch(dim_arg_spec, *dim_arg_spec_re, &arg_idx, &arg_axis_idx)) { - return tsl::errors::InvalidArgument("Syntax error in dim_args_spec '", - dim_arg_spec, "'"); + return absl::InvalidArgumentError( + absl::StrCat("Syntax error in dim_args_spec '", dim_arg_spec, "'")); } if (arg_idx < 0 || arg_idx >= arguments.size()) { - return tsl::errors::InvalidArgument( + return absl::InvalidArgumentError(absl::StrCat( "Invalid argument index ", arg_idx, " when the number of non-dimension arguments is ", arguments.size(), - " in dim_arg_spec '", dim_arg_spec, "'"); + " in dim_arg_spec '", dim_arg_spec, "'")); } mlir::RankedTensorType arg_type = arguments[arg_idx].getType().dyn_cast(); if (!arg_type) { - return tsl::errors::InvalidArgument( - "Argument ", arg_idx, " referenced in dim_arg_spec '", dim_arg_spec, - "' does not have a RankedTensorType"); + return absl::InvalidArgumentError( + absl::StrCat("Argument ", arg_idx, " referenced in dim_arg_spec '", + dim_arg_spec, "' does not have a RankedTensorType")); } if (arg_axis_idx < 0 || arg_axis_idx >= arg_type.getShape().size()) { - return tsl::errors::InvalidArgument( + return absl::InvalidArgumentError(absl::StrCat( "Invalid axis index ", arg_axis_idx, " when the rank of non-dimension argument ", arg_idx, " is ", - arg_type.getShape().size(), " in dim_arg_spec '", dim_arg_spec, "'"); + arg_type.getShape().size(), " in dim_arg_spec '", dim_arg_spec, "'")); } mlir::Value val; mlir::Type get_dim_type = @@ -120,27 +145,14 @@ tsl::StatusOr ComputeDimensionValue( tsl::StatusOr> XlaCallModuleLoader::Create( mlir::MLIRContext *context, int version, std::string module_str, - std::vector dim_args_spec, int platform_index) { - if (version < VERSION_MINIMUM_SUPPORTED) { - return 
tsl::errors::InvalidArgument( - "XlaCallModuleOp with version ", version, - " is not supported anymore. Must be >= ", VERSION_MINIMUM_SUPPORTED); - } - if (version > VERSION_MAXIMUM_SUPPORTED) { - return tsl::errors::InvalidArgument( - "XlaCallModuleOp with version ", version, - " is not supported by this build. Must be <= ", - VERSION_MAXIMUM_SUPPORTED); - } - - if (version < VERSION_START_PLATFORMS) { - platform_index = -1; - } - + std::vector dim_args_spec, + std::vector disabled_checks, + std::vector platforms, std::string loading_platform) { std::unique_ptr loader(new XlaCallModuleLoader); TF_RETURN_IF_ERROR(loader->LoadAndPreprocessModule( context, version, std::move(module_str), std::move(dim_args_spec), - platform_index)); + std::move(disabled_checks), std::move(platforms), + std::move(loading_platform))); return loader; } @@ -191,18 +203,18 @@ tsl::Status XlaCallModuleLoader::AddMainWrapper() { mlir::func::FuncOp orig_main = module_->lookupSymbol("main"); if (!orig_main) { - return tsl::errors::InvalidArgument("Cannot find 'main' in module"); + return absl::InvalidArgumentError("Cannot find 'main' in module"); } int nr_platform_args = 0; if (platform_index_ >= 0) { nr_platform_args = 1; } if (orig_main.getNumArguments() <= nr_platform_args + nr_dim_args) { - return tsl::errors::InvalidArgument( - "The module should have ", nr_platform_args, - " platform index arguments and ", nr_dim_args, - " dimension arguments, but it ", "has only ", - orig_main.getNumArguments(), " total arguments"); + return absl::InvalidArgumentError( + absl::StrCat("The module should have ", nr_platform_args, + " platform index arguments and ", nr_dim_args, + " dimension arguments, but it ", "has only ", + orig_main.getNumArguments(), " total arguments")); } mlir::Block &orig_main_body = orig_main.front(); @@ -237,18 +249,18 @@ tsl::Status XlaCallModuleLoader::AddMainWrapper() { !arg_ranked_type.getShape().empty()) { std::string argument_type = (i < nr_platform_args) ? 
"platform index" : "dimension"; - return tsl::errors::InvalidArgument( + return absl::InvalidArgumentError(absl::StrCat( "Module argument at index ", i, " should be a 0-dimensional integer-tensor ", argument_type, - " argument but has type ", mlir::debugString(arg_type)); + " argument but has type ", mlir::debugString(arg_type))); } if (i < nr_platform_args) { if (arg_ranked_type.getElementTypeBitWidth() != 32) { - return tsl::errors::InvalidArgument( - "Module argument at index ", i, - " should be a 0-dimensional 32-bit integer-tensor" - " platform index argument but has type ", - mlir::debugString(arg_type)); + return absl::InvalidArgumentError( + absl::StrCat("Module argument at index ", i, + " should be a 0-dimensional 32-bit integer-tensor" + " platform index argument but has type ", + mlir::debugString(arg_type))); } call_args[i] = op_builder.create( block_args[0].getLoc(), @@ -268,8 +280,10 @@ tsl::Status XlaCallModuleLoader::AddMainWrapper() { mlir::func::CallOp call_op = op_builder.create( loc, orig_main.getResultTypes(), orig_main.getSymName(), call_args); op_builder.create(loc, call_op.getResults()); - VLOG(3) << "XlaCallModule module with wrapper: " - << mlir::debugString(*module_); + + if (VLOG_IS_ON(5)) { + DumpMlirOpToFile("xla_call_module.after_add_main_wrapper", *module_); + } return tsl::OkStatus(); } @@ -283,35 +297,62 @@ tsl::Status XlaCallModuleLoader::RefineDynamicShapes( int nr_dim_args = dim_args_spec_.size(); int non_dimension_arguments = input_shapes.size(); if (non_dimension_arguments != main_body.getNumArguments()) { - return tsl::errors::InvalidArgument( + return absl::InvalidArgumentError(absl::StrCat( "Incorrect number of arguments passed to XlaCallModule: ", non_dimension_arguments, ". The module takes ", main_body.getNumArguments() + nr_platform_args + nr_dim_args, " arguments of which ", nr_platform_args, " platform index arguments and ", nr_dim_args, " dimension arguments. 
It must be called with ", - main_body.getNumArguments(), " arguments."); + main_body.getNumArguments(), " arguments.")); } mlir::Builder builder(module_->getContext()); std::vector static_array_input_types(non_dimension_arguments); for (int i = 0, end = non_dimension_arguments; i < end; ++i) { const xla::Shape &xla_shape = input_shapes[i]; - std::vector xla_dimensions(xla_shape.dimensions().begin(), - xla_shape.dimensions().end()); - TF_ASSIGN_OR_RETURN( - mlir::Type element_type, - ConvertPrimitiveTypeToMLIRType(xla_shape.element_type(), builder)); - mlir::Type type = mlir::RankedTensorType::get(xla_dimensions, element_type); - // TODO(burmako): This fails with an obscure compilation error. - // TF_ASSIGN_OR_RETURN( - // mlir::Type type, - // ConvertShapeToType(xla_shape, builder)); - VLOG(3) << "XlaCallModule static array input type #" << i << ": " - << mlir::debugString(type); - // TODO(b/278273480): Determine whether it's safe to override the element - // type using that from the input shape. - static_array_input_types[i] = type; + if (xla_shape.IsToken()) { + static_array_input_types[i] = mlir::stablehlo::TokenType::get(context_); + } else { + std::vector xla_dimensions(xla_shape.dimensions().begin(), + xla_shape.dimensions().end()); + TF_ASSIGN_OR_RETURN( + mlir::Type element_type, + ConvertPrimitiveTypeToMLIRType(xla_shape.element_type(), builder)); + mlir::RankedTensorType type = + mlir::RankedTensorType::get(xla_dimensions, element_type); + // TODO(burmako): This fails with an obscure compilation error. 
+ // TF_ASSIGN_OR_RETURN( + // mlir::Type type, + // ConvertShapeToType(xla_shape, builder)); + VLOG(3) << "XlaCallModule static array input type #" << i << ": " + << mlir::debugString(type); + mlir::TensorType arg_type = + main_body.getArgument(i).getType().dyn_cast(); + if (arg_type == nullptr) { + return absl::InvalidArgumentError(absl::StrCat( + "Argument ", i, " passed to XlaCallModule is not a tensor")); + } + + if (arg_type.getElementType() != type.getElementType()) { + return absl::InvalidArgumentError(absl::StrCat( + "Element type mismatch for argument ", i, + " passed to XlaCallModule: ", "expecting ", + mlir::debugString(arg_type), ", got ", mlir::debugString(type))); + } + + if (auto ranked_arg_type = arg_type.dyn_cast()) { + if (mlir::failed(mlir::verifyCompatibleShape(ranked_arg_type.getShape(), + type.getShape()))) { + return absl::InvalidArgumentError(absl::StrCat( + "Shape mismatch for argument ", i, + " passed to XlaCallModule: ", "expecting ", + mlir::debugString(arg_type), ", got ", mlir::debugString(type))); + } + } + + static_array_input_types[i] = type; + } } // Refine 'main' argument types to use static input types instead. @@ -320,13 +361,18 @@ tsl::Status XlaCallModuleLoader::RefineDynamicShapes( // shape refinement as explained below. // Before refining the argument types it is useful to run the inliner to // remove calls that may be called with the input arguments. 
- mlir::PassManager pm_inline(module_->getContext()); - pm_inline.addPass(mlir::createInlinerPass()); - if (!mlir::succeeded(pm_inline.run(*module_))) { - return tsl::errors::InvalidArgument("Module inlining failed"); + { + mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); + + mlir::PassManager pm_inline(module_->getContext()); + applyTensorflowAndCLOptions(pm_inline); + pm_inline.addPass(mlir::createInlinerPass()); + + if (mlir::failed(pm_inline.run(*module_))) { + return absl::InvalidArgumentError(absl::StrCat( + "Module inlining failed: ", diag_handler.ConsumeStatus().ToString())); + } } - VLOG(3) << "XlaCallModule module after inlining: " - << mlir::debugString(*module_); auto static_array_output_types = llvm::to_vector(main_.getResultTypes()); for (auto i = 0; i < main_body.getNumArguments(); ++i) { @@ -346,42 +392,49 @@ tsl::Status XlaCallModuleLoader::RefineDynamicShapes( } main_.setType(builder.getFunctionType(static_array_input_types, static_array_output_types)); + if (VLOG_IS_ON(5)) { + DumpMlirOpToFile("xla_call_module.after_refined_input_types", *module_); + } // Verify the module before running passes on it. // If the module doesn't pass verification, all sorts of weirdness might // happen if we run the pass manager. 
- if (failed(verify(*module_))) { - VLOG(3) << "XlaCallModule module with verification failed: " - << mlir::debugString(*module_); - return tsl::errors::InvalidArgument("Module verification failed"); - } - mlir::PassManager pm(module_->getContext()); - if (VLOG_IS_ON(3)) { - auto print_before = [](mlir::Pass *, mlir::Operation *) { return true; }; - auto print_after = [](mlir::Pass *, mlir::Operation *) { return true; }; - pm.enableIRPrinting(print_before, print_after, /*printModuleScope=*/true, - /*printAfterOnlyOnChange=*/false); - } - pm.addPass(mlir::createCSEPass()); - pm.addPass(mlir::stablehlo::createStablehloRefineShapesPass()); - pm.addNestedPass( - mlir::stablehlo::createStablehloCanonicalizeDynamismPass()); - if (!mlir::succeeded(pm.run(*module_))) { - return tsl::errors::InvalidArgument("Module shape refinement failed"); - } + { + mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); - VLOG(3) << "XlaCallModule module with refined shapes: " - << mlir::debugString(*module_); + if (failed(verify(*module_))) { + return absl::InvalidArgumentError( + absl::StrCat("Module verification failed: ", + diag_handler.ConsumeStatus().ToString())); + } + + mlir::PassManager pm(module_->getContext()); + applyTensorflowAndCLOptions(pm); + pm.addPass(mlir::createCSEPass()); + pm.addPass(mlir::stablehlo::createStablehloRefineShapesPass()); + pm.addNestedPass( + mlir::stablehlo::createStablehloCanonicalizeDynamismPass()); + if (mlir::failed(pm.run(*module_))) { + return absl::InvalidArgumentError( + absl::StrCat("Module shape refinement failed: ", + diag_handler.ConsumeStatus().ToString())); + } + + if (VLOG_IS_ON(3)) { + DumpMlirOpToFile("xla_call_module.after_shape_refinement", *module_); + } + } return tsl::OkStatus(); } tsl::Status XlaCallModuleLoader::LoadAndPreprocessModule( mlir::MLIRContext *context, int version, std::string module_str, - std::vector dim_args_spec, int platform_index) { + std::vector dim_args_spec, + std::vector disabled_checks, 
+ std::vector platforms, std::string loading_platform) { context_ = context; version_ = version; dim_args_spec_ = std::move(dim_args_spec); - platform_index_ = platform_index; // Load a superset of dialects; we should check at serialization time that // we only include allowable dialects. @@ -390,6 +443,13 @@ tsl::Status XlaCallModuleLoader::LoadAndPreprocessModule( context_->loadDialect(); context_->loadDialect(); context_->loadDialect(); + + if (version >= VERSION_START_SUPPORT_DISABLED_CHECKS && platforms.empty()) { + return absl::InvalidArgumentError( + absl::StrCat("XlaCallModuleOp with version ", version, + " must have non-empty platforms.")); + } + // Parses both IR text and bytecode. if (version >= VERSION_START_STABLE_HLO_COMPATIBILITY) { module_ = @@ -398,22 +458,75 @@ tsl::Status XlaCallModuleLoader::LoadAndPreprocessModule( module_ = mlir::parseSourceString(module_str, context_); } + std::vector loading_disabled_checks = disabled_checks; + loading_disabled_checks.insert( + loading_disabled_checks.end(), + GetXlaCallModuleFlags()->disabled_checks.begin(), + GetXlaCallModuleFlags()->disabled_checks.end()); if (!module_) { - return tsl::errors::InvalidArgument("Cannot deserialize computation"); + return absl::InvalidArgumentError("Cannot deserialize computation"); } - VLOG(3) << "Parsed serialized module (version " << version - << ", platform_index = " << platform_index_ << ", dim_args_spec = [" - << absl::StrJoin(dim_args_spec_, ", ") << "])\n" - << mlir::debugString(*module_); - if (failed(module_->verifyInvariants())) { - VLOG(1) << "MLIR verification failed."; - module_->dump(); - return tsl::errors::InvalidArgument("Error verifying module"); + VLOG(3) << "Parsed serialized module (version " << version + << ", platforms = [" << absl::StrJoin(platforms, ", ") + << "], loading_platform = " << loading_platform + << ", dim_args_spec = [" << absl::StrJoin(dim_args_spec_, ", ") + << "], disabled_checks = [" << absl::StrJoin(disabled_checks, ", ") + << "], 
loading_disabled_checks = [" + << absl::StrJoin(loading_disabled_checks, ", ") << "]), module = " + << DumpMlirOpToFile("xla_call_module.parsed", *module_); + + if (version < VERSION_MINIMUM_SUPPORTED) { + return absl::InvalidArgumentError(absl::StrCat( + "XlaCallModuleOp with version ", version, + " is not supported anymore. Must be >= ", VERSION_MINIMUM_SUPPORTED)); + } + if (version > VERSION_MAXIMUM_SUPPORTED) { + return absl::InvalidArgumentError( + absl::StrCat("XlaCallModuleOp with version ", version, + " is not supported by this build. Must be <= ", + VERSION_MAXIMUM_SUPPORTED)); + } + + platform_index_ = -1; + if (!platforms.empty()) { + auto found_platform = + std::find(platforms.begin(), platforms.end(), loading_platform); + if (found_platform == platforms.end()) { + if (!IsPlatformCheckDisabled(loading_disabled_checks)) { + return absl::NotFoundError(absl::StrCat( + "The current platform ", loading_platform, + " is not among the platforms required by the module: [", + absl::StrJoin(platforms, ", "), "]")); + } else { + if (platforms.size() > 1) { + platform_index_ = 0; + } + } + } else { + // We only use a platform index arguments if we support at least 2 + // platforms. 
+ if (platforms.size() > 1) { + platform_index_ = found_platform - platforms.begin(); + } + } + } + + if (version >= VERSION_START_SUPPORT_CALL_TF_GRAPH && + !dim_args_spec_.empty()) { + return absl::InvalidArgumentError( + "dim_args_spec not supported in this version"); + } + { + mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); + if (mlir::failed(mlir::verify(*module_))) { + return absl::InvalidArgumentError(absl::StrCat( + "Error verifying module: ", diag_handler.ConsumeStatus().ToString())); + } } main_ = module_->lookupSymbol("main"); if (!main_) { - return tsl::errors::InvalidArgument("Cannot find 'main' in module"); + return absl::InvalidArgumentError("Cannot find 'main' in module"); } if (!dim_args_spec_.empty() || platform_index_ >= 0) { @@ -423,9 +536,9 @@ tsl::Status XlaCallModuleLoader::LoadAndPreprocessModule( return tsl::OkStatus(); } -tsl::Status XlaCallModuleLoader::ValidateModule() { +tsl::Status XlaCallModuleLoader::ValidateDialect() { + mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); bool moduleHasUnsupportedDialects = false; - bool moduleHasDynamicShapes = false; module_->walk([&](mlir::Operation *op) { // StableHLO programs created by jax2tf only contain operations @@ -434,10 +547,23 @@ tsl::Status XlaCallModuleLoader::ValidateModule() { mlir::func::FuncDialect, mlir::stablehlo::StablehloDialect>( op->getDialect())) { moduleHasUnsupportedDialects = true; - VLOG(3) << "Operation has unsupported dialects: " - << mlir::debugString(*op); + op->emitOpError() << "is an op from an unsupported dialect"; } + }); + if (moduleHasUnsupportedDialects) { + return absl::InvalidArgumentError( + absl::StrCat("Module has unsupported dialects: ", + diag_handler.ConsumeStatus().ToString())); + } + return tsl::OkStatus(); +} + +tsl::Status XlaCallModuleLoader::ValidateStaticShapes() { + mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); + bool moduleHasDynamicShapes = false; + + 
module_->walk([&](mlir::Operation *op) { // It's sufficient to only check results because operands either come from // results or from block arguments which are checked below. auto hasDynamicShape = [](mlir::Value value) { @@ -452,22 +578,53 @@ tsl::Status XlaCallModuleLoader::ValidateModule() { } if (opHasDynamicShapes) { moduleHasDynamicShapes = true; - VLOG(3) << "Operation has dynamic shapes: " << mlir::debugString(*op); + op->emitOpError() << "has dynamic shapes"; } }); - if (moduleHasUnsupportedDialects) - return tsl::errors::InvalidArgument("Module has unsupported dialects"); - if (moduleHasDynamicShapes) - return tsl::errors::InvalidArgument("Module has dynamic shapes"); + if (moduleHasDynamicShapes) { + return absl::InvalidArgumentError( + absl::StrCat("Module has dynamic shapes: ", + diag_handler.ConsumeStatus().ToString())); + } return tsl::OkStatus(); } +absl::Status XlaCallModuleLoader::LowerModuleToMhlo() { + mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); + + mlir::PassManager pm(module_->getContext()); + applyTensorflowAndCLOptions(pm); + pm.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); + pm.addNestedPass( + mlir::mhlo::createLegalizeSparseChloToLinalgPass()); + pm.addNestedPass(mlir::mhlo::createChloLegalizeToHloPass( + /*legalizeBroadcasts=*/true, /*expandCompositions=*/true)); + pm.addNestedPass(mlir::createCanonicalizerPass()); + // In order to export to XLA, we must sink constants to control flow + // regions, since XLA uses functional control flow. 
+ pm.addNestedPass( + mlir::mhlo::createSinkConstantsToControlFlowPass()); + if (failed(pm.run(*module_))) { + return absl::InternalError( + absl::StrCat("MHLO->HLO lowering passes failed: ", + diag_handler.ConsumeStatus().ToString())); + } + + if (VLOG_IS_ON(5)) { + DumpMlirOpToFile("xla_call_module.after_mhlo_lowering", *module_); + } + + return absl::OkStatus(); +} + tsl::StatusOr XlaCallModuleLoader::ToXlaComputation() { - xla::XlaComputation xla_computation; + xla::HloProto proto; + mlir::MlirToHloConversionOptions options; TF_RETURN_IF_ERROR( - MlirToXlaComputation(*module_, xla_computation, false, false)); - return xla_computation; + mlir::ConvertMlirHloToHlo(*module_, &proto, /*use_tuple_args=*/false, + /*return_tuple=false*/ false, options)); + return xla::XlaComputation(std::move(*proto.mutable_hlo_module())); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h index 6196cfe1f20..54aaa6ae58f 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -34,7 +35,9 @@ class XlaCallModuleLoader { public: static tsl::StatusOr> Create( mlir::MLIRContext* context, int version, std::string module_str, - std::vector dim_args_spec, int platform_index); + std::vector dim_args_spec, + std::vector disabled_checks, + std::vector platforms, std::string loading_platform); int nr_outputs() { return main_.getNumResults(); } mlir::TypeRange output_types() { return main_.getResultTypes(); } @@ -52,13 +55,26 @@ class XlaCallModuleLoader { // cause lifetime issues. 
tsl::Status RefineDynamicShapes(llvm::ArrayRef input_shapes); - // Validate that the module represents a statically-shaped StableHLO program, + // Validates that the module only contains ops from valid dialects. + tsl::Status ValidateDialect(); + + // Validates that the module represents a statically-shaped StableHLO program, // otherwise all sorts of weirdness might happen in the HLO exporter which is // much easier to detect here. - tsl::Status ValidateModule(); + tsl::Status ValidateStaticShapes(); + // Lowers the StableHLO module to MHLO in place. + absl::Status LowerModuleToMhlo(); + + // Lowers the MHLO module to XlaComputation and returns it. + // + // REQUIRES: `LowerModuleToMhlo()` is called beforehand. tsl::StatusOr ToXlaComputation(); + // Returns the deserialized stablehlo module. + mlir::ModuleOp module() & { return *module_; } + mlir::OwningOpRef module() && { return std::move(module_); } + private: XlaCallModuleLoader() = default; @@ -66,7 +82,9 @@ class XlaCallModuleLoader { tsl::Status LoadAndPreprocessModule(mlir::MLIRContext* context, int version, std::string module_str, std::vector dim_args_spec, - int platform_index); + std::vector disabled_checks, + std::vector platforms, + std::string loading_platform); // Adds a wrapper for the "main" function to compute the platform index and // the dimension arguments. @@ -75,6 +93,8 @@ class XlaCallModuleLoader { mlir::MLIRContext* context_; int version_; mlir::OwningOpRef module_; + // Index in platforms of the current platform, or -1 if module does not take + // a platform index arg. int platform_index_; std::vector dim_args_spec_; mlir::func::FuncOp main_; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc index fbb853528fc..8e6c6e9af93 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_op.cc @@ -19,22 +19,109 @@ limitations under the License. 
#include #include +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/TypeRange.h" // from @llvm-project +#include "mlir/IR/TypeUtilities.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/DebugStringHelper.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h" +#include "tensorflow/compiler/tf2xla/side_effect_util.h" +#include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/IR/hlo_ops.h" +#include "tensorflow/compiler/xla/mlir_hlo/mhlo/transforms/passes.h" #include "tensorflow/compiler/xla/shape_util.h" -#include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_utils.h" +#include "tensorflow/compiler/xla/translate/hlo_to_mhlo/hlo_to_mlir_hlo.h" +#include "tensorflow/compiler/xla/translate/mhlo_to_hlo/type_to_shape.h" +#include "tensorflow/compiler/xla/util.h" +#include "tensorflow/core/framework/attr_value.pb.h" #include "tensorflow/core/framework/op_kernel.h" #include 
"tensorflow/core/framework/op_requires.h" #include "tensorflow/core/tpu/tpu_defs.h" +#include "tensorflow/tsl/platform/errors.h" +#include "tensorflow/tsl/platform/statusor.h" namespace tensorflow { namespace { +// Imports the given `XlaComputation` into StableHLO functions the MLIR module. +// Returns the MLIR function in the imported module that represents the entry +// function of the imported computation. +absl::StatusOr ImportXlaComputation( + mlir::SymbolTableCollection &symbol_table_collection, mlir::ModuleOp module, + const xla::XlaComputation &computation) { + mlir::MLIRContext *context = module.getContext(); + mlir::SymbolTable &symbol_table = + symbol_table_collection.getSymbolTable(module); + + mlir::OwningOpRef imported = + mlir::ModuleOp::create(mlir::UnknownLoc::get(context)); + context->loadDialect(); + context->loadDialect(); + TF_RETURN_IF_ERROR( + xla::ConvertHloToMlirHlo(*imported, &computation.proto(), + /*import_all_computations=*/true)); + if (VLOG_IS_ON(5)) { + DumpMlirOpToFile("xla_call_module.imported_tf_func", *imported); + } + + // Rename all functions beforehand in order to avoid conflicts. 
+ mlir::StringAttr main_func_name; + for (auto func : imported->getOps()) { + mlir::StringAttr name = func.getSymNameAttr(); + mlir::StringAttr new_name = name; + for (int i = 0; symbol_table.lookup(new_name) != nullptr; ++i) { + new_name = mlir::StringAttr::get( + context, absl::StrCat(absl::string_view(name.getValue()), i)); + } + if (new_name != name) { + if (failed(mlir::SymbolTable::replaceAllSymbolUses(func, new_name, + *imported))) { + return absl::InternalError( + absl::StrCat("Failed to replace all symbol uses of function '", + absl::string_view(func.getName()), "'")); + } + func.setSymNameAttr(new_name); + } + if (name.getValue() == "main") { + main_func_name = new_name; + } + } + if (!main_func_name) { + return absl::InternalError( + "HLO module lowered from TF function is missing a main function"); + } + + mlir::func::FuncOp main_func; + for (auto func : imported->getOps()) { + auto cloned = func.clone(); + cloned.setPrivate(); + symbol_table.insert(cloned); + if (func.getSymNameAttr() == main_func_name) { + main_func = cloned; + } + } + + return main_func; +} + class XlaCallModuleOp : public XlaOpKernel { public: explicit XlaCallModuleOp(OpKernelConstruction *ctx) : XlaOpKernel(ctx) { @@ -50,73 +137,105 @@ class XlaCallModuleOp : public XlaOpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("dim_args_spec", &dim_args_spec)); OP_REQUIRES(ctx, expected_output_shapes.size() == expected_output_dtypes.size(), - errors::InvalidArgument("The size of Sout (", - expected_output_shapes.size(), - ") must match the size of Tout (", - expected_output_dtypes.size(), ")")); + absl::InvalidArgumentError(absl::StrCat( + "The size of Sout (", expected_output_shapes.size(), + ") must match the size of Tout (", + expected_output_dtypes.size(), ")"))); + std::vector disabled_checks; + OP_REQUIRES_OK(ctx, ctx->GetAttr("disabled_checks", &disabled_checks)); std::vector platforms; - // Index in platforms of the current platform, or -1 if module does not take - // a platform index 
arg. - int platform_index = -1; - if (ctx->HasAttr("platforms")) { - OP_REQUIRES_OK(ctx, ctx->GetAttr("platforms", &platforms)); - if (!platforms.empty()) { - string current_device_type = ctx->device_type().type_string(); - string current_platform = ""; - if (current_device_type == DEVICE_CPU_XLA_JIT) { - current_platform = "CPU"; - } else if (current_device_type == DEVICE_GPU_XLA_JIT) { + OP_REQUIRES_OK(ctx, ctx->GetAttr("platforms", &platforms)); + + string loading_device_type = ctx->device_type().type_string(); + string loading_platform = ""; + if (loading_device_type == DEVICE_CPU_XLA_JIT) { + loading_platform = "CPU"; + } else if (loading_device_type == DEVICE_GPU_XLA_JIT) { #if GOOGLE_CUDA - current_platform = "CUDA"; + loading_platform = "CUDA"; #elif TENSORFLOW_USE_ROCM - current_platform = "ROCM"; + loading_platform = "ROCM"; #else - OP_REQUIRES(ctx, false, - errors::Unimplemented("CUDA or ROCM build required")); + OP_REQUIRES(ctx, false, + absl::UnimplementedError("CUDA or ROCM build required")); #endif - } else if (current_device_type == DEVICE_TPU_XLA_JIT) { - current_platform = "TPU"; - } else { - OP_REQUIRES(ctx, false, - errors::Unimplemented("Unexpected device type ", - current_device_type)); - } - VLOG(3) << "Initialized XlaCallModuleOp on " << current_platform; - auto found_platform = - std::find(platforms.begin(), platforms.end(), current_platform); - OP_REQUIRES(ctx, found_platform != platforms.end(), - errors::NotFound( - "The current platform ", current_platform, - " is not among the platforms required by the module: [", - absl::StrJoin(platforms, ", "), "]")); - // We only use a platform index arguments if we support at least 2 - // platforms. 
- if (platforms.size() > 1) { - platform_index = found_platform - platforms.begin(); - } - } + } else if (loading_device_type == DEVICE_TPU_XLA_JIT) { + loading_platform = "TPU"; + } else { + OP_REQUIRES(ctx, false, + absl::UnimplementedError(absl::StrCat( + "Unexpected device type ", loading_device_type))); + } + VLOG(3) << "Initialized XlaCallModuleOp on " << loading_platform; + { + auto loader = XlaCallModuleLoader::Create( + &context_, version, std::move(module_str), std::move(dim_args_spec), + std::move(disabled_checks), std::move(platforms), loading_platform); + OP_REQUIRES_OK(ctx, loader.status()); + loader_ = *std::move(loader); + } + OP_REQUIRES_OK(ctx, loader_->ValidateDialect()); + + if (!ctx->GetAttr("function_list", &function_list_).ok()) { + function_list_.clear(); } - auto loader = - XlaCallModuleLoader::Create(&context_, version, std::move(module_str), - std::move(dim_args_spec), platform_index); - OP_REQUIRES_OK(ctx, loader.status()); - loader_ = *std::move(loader); + if (!ctx->GetAttr("has_token_input_output", &module_has_token_input_output_) + .ok()) { + module_has_token_input_output_ = false; + } + if (!ctx->GetAttr(kXlaTokenInputNodesAttrName, &token_input_nodes_).ok()) { + token_input_nodes_.clear(); + op_has_token_input_output_ = false; + } else { + op_has_token_input_output_ = !token_input_nodes_.empty(); + } + if (!ctx->GetAttr(kXlaOriginalOutsideCompilationNodeName, + &original_node_name_) + .ok()) { + original_node_name_ = name(); + } } void Compile(XlaOpKernelContext *ctx) override { + XlaCompiler *const compiler = ctx->compiler(); + xla::XlaBuilder *const b = ctx->builder(); + std::vector input_shapes; + if (module_has_token_input_output_) { + input_shapes.push_back(xla::ShapeUtil::MakeTokenShape()); + } for (int i = 0; i < ctx->num_inputs(); ++i) { auto shape = ctx->InputXlaShape(i); OP_REQUIRES_OK(ctx, shape.status()); input_shapes.push_back(*std::move(shape)); } OP_REQUIRES_OK(ctx, loader_->RefineDynamicShapes(input_shapes)); - 
OP_REQUIRES_OK(ctx, loader_->ValidateModule()); + OP_REQUIRES_OK(ctx, loader_->ValidateStaticShapes()); + OP_REQUIRES_OK(ctx, loader_->LowerModuleToMhlo()); + if (!function_list_.empty()) { + OP_REQUIRES_OK(ctx, LowerTfFunctionCalls(ctx)); + } - std::vector inputs(ctx->num_inputs()); + std::vector inputs; + if (module_has_token_input_output_) { + // The main function expects a token input at the start. + if (!token_input_nodes_.empty()) { + std::vector token_inputs; + for (const string &node_name : token_input_nodes_) { + auto token = compiler->GetNodeToken(node_name); + OP_REQUIRES_OK(ctx, token.status()); + token_inputs.push_back(token.value()); + } + inputs.push_back(xla::AfterAll(b, token_inputs)); + } else { + // Generate a dummy token if the main function expects a token but the + // XlaCallModule doesn't take one. + inputs.push_back(xla::CreateToken(b)); + } + } for (int i = 0, end = ctx->num_inputs(); i < end; ++i) { - inputs[i] = ctx->Input(i); + inputs.push_back(ctx->Input(i)); } auto xla_computation = loader_->ToXlaComputation(); @@ -132,30 +251,268 @@ class XlaCallModuleOp : public XlaOpKernel { xla_computation->proto(), module_config)); xla::HloPrintOptions options; options = xla::HloPrintOptions::ShortParsable(); - VLOG(3) << "XlaCallModule converted to HLO module " - << hlo_module->ToString(options); + XLA_VLOG_LINES(3, absl::StrCat("XlaCallModule converted to HLO module ", + hlo_module->ToString(options))); } - xla::XlaOp output = xla::Call(ctx->builder(), *xla_computation, inputs); + xla::XlaOp output = xla::Call(b, *xla_computation, inputs); // Check that the resulting computation returns the expected shape - OP_REQUIRES_VALUE(xla::Shape found_output_shape, ctx, - ctx->builder()->GetShape(output)); + OP_REQUIRES_VALUE(xla::Shape found_output_shape, ctx, b->GetShape(output)); VLOG(3) << "XlaCallModule compiled output shape : " << xla::ShapeUtil::HumanString(found_output_shape); + std::vector outputs; if (loader_->nr_outputs() == 1) { - 
ctx->SetOutput(0, output); + outputs.push_back(output); } else { for (int i = 0; i < loader_->nr_outputs(); ++i) { - ctx->SetOutput(i, xla::GetTupleElement(output, i)); + outputs.push_back(xla::GetTupleElement(output, i)); } } + + xla::XlaOp token_output; + if (module_has_token_input_output_) { + // The main function returns a token as the first output. + token_output = outputs.front(); + outputs.erase(outputs.begin()); + auto shape = b->GetShape(token_output); + OP_REQUIRES_OK(ctx, shape.status()); + OP_REQUIRES(ctx, shape->IsToken(), + absl::FailedPreconditionError( + absl::StrCat("Token output is not token type: ", + xla::ShapeUtil::HumanString(*shape)))); + } + if (op_has_token_input_output_) { + if (token_output.IsUninitialized()) { + // The main function does not return any token, but the XlaCallModule is + // expected to return one. Create a dummy token. + token_output = xla::CreateToken(b); + } + OP_REQUIRES_OK(ctx, + compiler->SetNodeToken(original_node_name_, token_output)); + } + + for (int i = 0; i < outputs.size(); ++i) { + ctx->SetOutput(i, outputs[i]); + } } private: + // Lowers `mhlo.CustomCall` ops representing TF function calls into nested XLA + // computation. The called TF functions are lowered into MHLO and inserted as + // function calls in the main module. + // + // This is implemented here instead of in xla_call_module_loader.cc in order + // to prevent cyclic dependency with TF MLIR passes. 
+ absl::Status LowerTfFunctionCalls(XlaOpKernelContext *ctx) { + mlir::ModuleOp module = loader_->module(); + mlir::SymbolTableCollection symbol_table_collection; + + llvm::SmallDenseSet updated_funcs; + + auto lower = [&](mlir::mhlo::CustomCallOp custom_call) -> absl::Status { + if (custom_call.getCallTargetName() != "tf.call_tf_function") { + return absl::OkStatus(); + } + + NameAttrList f; + bool custom_call_has_token_input_output = false; + { + auto backend_config = custom_call->getAttrOfType( + "tf.backend_config"); + if (!backend_config) { + return absl::InternalError( + "TF function custom call must have 'tf.backend_config' " + "attribute"); + } + + auto called_index = + backend_config.getAs("called_index"); + if (!called_index) { + return absl::InternalError( + "TF function custom call must have 'called_index' in the " + "'tf.backend_config' attribute"); + } + + int index = called_index.getInt(); + if (index < 0 || index >= function_list_.size()) { + return absl::OutOfRangeError(absl::StrCat( + "XlaCallModule has function_list of size ", function_list_.size(), + " but TF function custom call references function #", index)); + } + f = function_list_[index]; + + // Whether the custom call takes a token argument and returns another + // token. Used to model side effects. + if (auto attr = + backend_config.getAs("has_token_input_output"); + attr != nullptr) { + custom_call_has_token_input_output = attr.getValue(); + } + } + + // Lower the called TF function into an HLO module. 
+ + std::vector arguments; + { + mlir::TypeRange input_types(custom_call->getOperandTypes()); + if (custom_call_has_token_input_output) { + if (input_types.empty() || + !input_types.front().isa()) { + return absl::InvalidArgumentError(absl::StrCat( + "stablehlo.custom_call with has_token_input_output = true is " + "expected to take !stablehlo.token as the first argument, but " + "got ", + mlir::debugString(custom_call))); + } + input_types = input_types.drop_front(); + } + for (mlir::Type input_type : input_types) { + XlaCompiler::Argument &argument = arguments.emplace_back(); + argument.kind = XlaCompiler::Argument::kParameter; + TF_RETURN_IF_ERROR(ConvertToDataType(input_type, &argument.type)); + argument.shape = xla::TypeToShape(input_type); + } + + mlir::TypeRange result_types(custom_call->getResultTypes()); + if (custom_call_has_token_input_output) { + if (result_types.empty() || + !result_types.front().isa()) { + return absl::InvalidArgumentError(absl::StrCat( + "stablehlo.custom_call with has_token_input_output = true is " + "expected to return !stablehlo.token as the first result, but " + "got ", + mlir::debugString(custom_call))); + } + } + } + + XlaCompiler::CompileOptions options; + options.use_tuple_arg = true; + options.always_return_tuple = true; + options.is_entry_computation = false; + // Propagate tokens from XlaCallModule to inner computation. + options.add_token_input_output = op_has_token_input_output_; + + XlaCompiler::CompilationResult result; + TF_RETURN_IF_ERROR( + ctx->compiler()->CompileFunction(options, f, arguments, &result)); + + // Import the lowered HLO module into StableHLO functions in `module`. The + // main function accepts tupled arguments and returns tupled results. + TF_ASSIGN_OR_RETURN(mlir::func::FuncOp main_func, + ImportXlaComputation(symbol_table_collection, module, + *result.computation)); + + // Replace the custom call with ops that call the imported main function. 
+ mlir::OpBuilder builder(custom_call); + auto loc = custom_call.getLoc(); + + // Pack all arguments into a tuple (`options.use_tuple_arg` is true). If + // `has_tuple_input_output` is true, the first argument is a token type. + mlir::Value arg_tuple; + { + llvm::SmallVector args(custom_call->getOperands()); + if (custom_call_has_token_input_output) { + // Adjust the indexes since custom calls with `has_token_input_output` + // takes a token as the first argument, but TF2XLA'ed computation + // expects the token to be the last argument. + std::rotate(args.begin(), args.begin() + 1, args.end()); + } else if (options.add_token_input_output) { + // Add a dummy token if the inner computation takes a token but the + // custom call doesn't have a token argument. + args.push_back(builder.create(loc)); + } + + llvm::SmallVector elements; + elements.reserve(result.input_mapping.size()); + for (int index : result.input_mapping) { + elements.push_back(args[index]); + } + arg_tuple = + builder.create(loc, elements).getResult(); + } + + // Call the lowered function. + auto call = builder.create( + loc, main_func, mlir::ValueRange(arg_tuple)); + + // Unpack the result tuple (`options.always_return_tuple` is true). If + // `has_tuple_input_output` is true, the first result is a token type. + { + llvm::SmallVector results(custom_call->getResults()); + if (custom_call_has_token_input_output) { + // Adjust the indexes since custom calls with `has_token_input_output` + // returns a token as the first result, but TF2XLA'ed computation + // returns the token as the last result. + std::rotate(results.begin(), results.begin() + 1, results.end()); + + if (!options.add_token_input_output) { + // If the custom call returns a token but the inner computation + // doesn't, replace the token result with a dummy token. 
+ mlir::Value token = results.back(); + if (!token.use_empty()) { + token.replaceAllUsesWith( + builder.create(loc)); + } + results.pop_back(); + } + } + + for (const auto &it : llvm::enumerate(results)) { + if (!it.value().use_empty()) { + auto get_tuple_element = + builder.create( + loc, call.getResults().front(), it.index()); + it.value().replaceAllUsesWith(get_tuple_element.getResult()); + } + } + } + + updated_funcs.insert(call->getParentOfType()); + custom_call->erase(); + + return absl::OkStatus(); + }; + + absl::Status status; + mlir::WalkResult result = module->walk([&](mlir::mhlo::CustomCallOp op) { + status.Update(lower(op)); + if (!status.ok()) { + return mlir::WalkResult::interrupt(); + } + return mlir::WalkResult::advance(); + }); + if (result.wasInterrupted()) { + return status; + } + + // If the call results are used by `func.return`, then we may need to update + // function result types. + for (auto func : updated_funcs) { + auto ret = llvm::cast( + func.getFunctionBody().front().getTerminator()); + func.setFunctionType(mlir::FunctionType::get( + &context_, func.getArgumentTypes(), ret.getOperandTypes())); + } + + if (VLOG_IS_ON(5)) { + DumpMlirOpToFile("xla_call_module.after_tf_func_call_import", module); + } + return absl::OkStatus(); + } + mlir::MLIRContext context_{mlir::MLIRContext::Threading::DISABLED}; std::unique_ptr loader_; + std::vector function_list_; + + // Whether the StableHLO module's main function has token input/output. + bool module_has_token_input_output_; + // Whether the XlaCallModule op has token input/output. 
+ bool op_has_token_input_output_; + std::vector token_input_nodes_; + std::string original_node_name_; }; REGISTER_XLA_OP(Name("XlaCallModule"), XlaCallModuleOp); diff --git a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc index 0c67ad7f8de..40770909df7 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" diff --git a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc index 31f8f2840ba..63d10e399f7 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_dot_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/mlir_xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/shape_util.h" #include "tensorflow/compiler/tf2xla/type_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc index 5e297a8c80b..9871ac537c0 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_pad_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "absl/algorithm/container.h" #include "absl/strings/str_join.h" #include "tensorflow/compiler/tf2xla/shape_util.h" diff --git a/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc index bf5244e52e6..699e9248eaa 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_svd_op.cc @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "tensorflow/compiler/tf2xla/xla_op_kernel.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/compiler/xla/client/lib/constants.h" diff --git a/tensorflow/compiler/tf2xla/lib/util.cc b/tensorflow/compiler/tf2xla/lib/util.cc index 748971271bf..c8c03fd2b97 100644 --- a/tensorflow/compiler/tf2xla/lib/util.cc +++ b/tensorflow/compiler/tf2xla/lib/util.cc @@ -18,6 +18,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/literal.h" #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/compiler/xla/util.h" @@ -35,87 +36,35 @@ xla::XlaOp Zeros(xla::XlaBuilder* builder, const xla::Shape& shape) { xla::XlaOp FloatLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, double value) { - switch (type) { - case xla::F16: - return xla::ConstantR0(builder, static_cast(value)); - break; - case xla::BF16: - return xla::ConstantR0(builder, static_cast(value)); - break; - case xla::F32: - return xla::ConstantR0(builder, static_cast(value)); - break; - case xla::F64: - return xla::ConstantR0(builder, value); - break; - case xla::C64: - return xla::ConstantR0(builder, value); - break; - case xla::C128: - return xla::ConstantR0(builder, value); - break; - default: - LOG(FATAL) << "unhandled element type " << type; - } + return xla::primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> xla::XlaOp { + if constexpr (xla::primitive_util::IsFloatingPointType( + primitive_type_constant) || + xla::primitive_util::IsComplexType( + primitive_type_constant)) { + using NativeT = + xla::primitive_util::NativeTypeOf; + return xla::ConstantR0(builder, static_cast(value)); + } + LOG(FATAL) << "unhandled element type " << type; + }, + type); } xla::XlaOp IntegerLiteral(xla::XlaBuilder* builder, xla::PrimitiveType type, int64_t value) { - xla::Literal literal; - switch (type) { - case xla::U8: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::U16: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::U32: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::U64: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::S8: - literal = xla::LiteralUtil::CreateR0(value); - break; - 
case xla::S16: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::S32: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::S64: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::F32: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::F64: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::C64: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::C128: - literal = xla::LiteralUtil::CreateR0(value); - break; - case xla::PRED: - LOG(FATAL) << "pred element type is not integral"; - case xla::BF16: - literal = xla::LiteralUtil::CreateR0( - static_cast(value)); - break; - case xla::F16: - literal = - xla::LiteralUtil::CreateR0(static_cast(value)); - break; - case xla::TUPLE: - LOG(FATAL) << "tuple element type is not integral"; - case xla::OPAQUE_TYPE: - LOG(FATAL) << "opaque element type is not integral"; - default: - LOG(FATAL) << "unhandled element type " << type; - } + xla::Literal literal = xla::primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> xla::Literal { + if constexpr (xla::primitive_util::IsArrayType( + primitive_type_constant)) { + using NativeT = + xla::primitive_util::NativeTypeOf; + return xla::LiteralUtil::CreateR0( + static_cast(value)); + } + LOG(FATAL) << "unhandled element type " << type; + }, + type); return xla::ConstantLiteral(builder, literal); } diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 3959ebb5771..3ff4fa845d2 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -159,18 +159,12 @@ bool EnableNonTpuBridge(const Graph& graph) { // // The config_proto param is a required input for all TF1 graphs but it is // redundant for TF2 graphs. 
-MlirOptimizationPassState MlirBridgePass::GetPassState( - const DeviceSet* device_set, const ConfigProto& config_proto, - const Graph& graph, - const FunctionLibraryDefinition& function_library) const { +MlirOptimizationPassState GetPassStateImpl( + bool run_tpu_bridge, const ConfigProto& config_proto, const Graph& graph, + const FunctionLibraryDefinition& function_library) { // Skip MLIR TF/XLA Bridge if no TPU devices and no qualified CPU/GPU // graphs are found. - bool has_tpu_device = device_set ? HasTPUDevice(*device_set) : false; - // GetPassState is called once before MlirBridgePass starts, and the pass - // gets skipped if it is disabled. Log such cases in this function. The cases - // where the pass is enabled will only be logged during their execution to - // prevent them from being counted twice. - if (device_set && !has_tpu_device && !EnableNonTpuBridge(graph)) { + if (!run_tpu_bridge && !EnableNonTpuBridge(graph)) { // Only record CPU/GPU graphs that are qualified but filtered out if (HasQualifiedNonTPUOp(graph)) { metrics::UpdateTfMlirBridgeFirstPhaseCounter( @@ -184,11 +178,17 @@ MlirOptimizationPassState MlirBridgePass::GetPassState( // We set `uses_uninitialized_resource_args` to false here because the first // phase of the bridge is not affected by uninitialized resource args. + // GetMlirBridgeRolloutPolicy will analyze a TPU graph if users have not + // explicltly requested a policy. MlirBridgeRolloutPolicy policy = GetMlirBridgeRolloutPolicy( - graph, &function_library, config_proto, /*is_tpu_graph*/ has_tpu_device, + graph, &function_library, config_proto, /*run_tpu_bridge*/ run_tpu_bridge, /*uses_uninitialized_resource_args=*/false, /*is_v1_compat=*/false, /*record_stats=*/false); - if (has_tpu_device) { + // GetPassState is called once before MlirBridgePass starts, and the pass + // gets skipped if it is disabled. Log such cases in this function. 
The cases + // where the pass is enabled will only be logged during their execution to + // prevent them from being counted twice. + if (run_tpu_bridge) { switch (policy) { case MlirBridgeRolloutPolicy::kEnabledByUser: return MlirOptimizationPassState::Enabled; @@ -236,6 +236,20 @@ MlirOptimizationPassState MlirBridgePass::GetPassState( } } +MlirOptimizationPassState MlirBridgePass::GetPassState( + const DeviceSet* device_set, const ConfigProto& config_proto, + const Graph& graph, + const FunctionLibraryDefinition& function_library) const { + if (!device_set) { + // This is not expected in practice. + VLOG(1) << "Device set is empty!"; + return MlirOptimizationPassState::Disabled; + } + + return GetPassStateImpl(/*run_tpu_bridge*/ HasTPUDevice(*device_set), + config_proto, graph, function_library); +} + // This runs the first phase of the "bridge", transforming the graph in a form // that can be executed with delegation of some computations to an accelerator. // This builds on the model of XLA where a subset of the graph is encapsulated @@ -252,22 +266,17 @@ Status MlirBridgePass::Run(const std::string& function_name, // Check if there are TPU devices or TPU ops. If not, then check if the // non TPU graph is qualified to run TF2XLA Bridge. // This check needs to precede GetPassState for instrumentation purposes. - bool is_qualified_for_tpu_bridge = HasTPUDevicesAndOps(module), - is_qualified_for_non_tpu_bridge = false; - if (!is_qualified_for_tpu_bridge) - is_qualified_for_non_tpu_bridge = EnableNonTpuBridge(graph); - if (!is_qualified_for_tpu_bridge && !is_qualified_for_non_tpu_bridge) { + bool run_tpu_bridge = HasTPUDevicesAndOps(module); + if (!run_tpu_bridge && !HasQualifiedNonTPUOp(graph)) { VLOG(1) << "Skipping MLIR TF2XLA Bridge, no qualified devices or ops found."; return OkStatus(); } - // Set device_set to nullptr here as the device specific checks are performed - // based on the devices in the module. 
// TODO(b/241853328): Add caching of pass state and call logging/metrics // related to graph analysis from here. - auto pass_state = GetPassState(/*device_set=*/nullptr, config_proto, graph, - function_library); + auto pass_state = + GetPassStateImpl(run_tpu_bridge, config_proto, graph, function_library); if (pass_state == MlirOptimizationPassState::Disabled) { // GetPassState is called before run() and run() will only be called if the @@ -278,7 +287,7 @@ Status MlirBridgePass::Run(const std::string& function_name, return OkStatus(); } - if (is_qualified_for_tpu_bridge) { + if (run_tpu_bridge) { bool fallback_enabled = false; if (pass_state == MlirOptimizationPassState::FallbackEnabled) { // We set `uses_uninitialized_resource_args` to false here because the @@ -310,7 +319,7 @@ MlirOptimizationPassState MlirBridgeV1CompatPass::GetPassState( // phase of the bridge is not affected by uninitialized resource args. MlirBridgeRolloutPolicy policy = GetMlirBridgeRolloutPolicy( graph, /*function_library=*/&function_library, config_proto, - /*is_tpu_graph*/ true, + /*run_tpu_bridge*/ true, /*uses_uninitialized_resource_args=*/false, /*is_v1_compat=*/true, /*record_stats=*/false); switch (policy) { diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index e536ffa3746..2d80bf3c2a2 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -1326,6 +1326,9 @@ REGISTER_OP("XlaCallModule") .Attr("dim_args_spec: list(string) = []") .Attr("platforms: list(string) = []") .Attr("function_list: list(func) = []") + .Attr("has_token_input_output: bool = false") + .Attr("disabled_checks: list(string) = []") + .SetIsStateful() .SetShapeFn([](shape_inference::InferenceContext* c) { std::vector args_shapes; TF_RETURN_IF_ERROR(c->input("args", &args_shapes)); @@ -1361,7 +1364,8 @@ version: Tracks changes the semantics of the op, to support backwards version 3, the op also supports the `platforms` 
attribute. From version 4, the op carries a StableHLO module with compatibility guarantees. From version 5, XLACallModule can include `stablehlo.custom_call` op to execute tf - functions. + functions. From version 6 the op supports the `disabled_checks` attribute. + See more versioning details at https://github.com/search?q=repo%3Atensorflow%2Ftensorflow+path%3Axla_call_module+%22int+VERSION_MAXIMUM_SUPPORTED%22&type=code. module: A serialized computation, a text or bytecode representation of an mlir.Module. The return type must be a tuple if and only if the `Sout` is a list with 0 or more than 1 elements. The length of `Tout` and @@ -1369,15 +1373,16 @@ module: A serialized computation, a text or bytecode representation of module returns a single result. Tout: List of output tensor data types. Sout: List of output tensor shapes. -platforms: the list of platforms supported by `module`. If the list is empty, - the `module` is platform independent or there should be no platform checking - or preprocessing. The list can contain the strings "CPU", "CUDA", "ROCM", - or "TPU". - If the list is not empty then it is an error to compile this op for a - platform that does not appear in the list. If the list contains more than +platforms: the list of platforms supported by `module`. The list can contain + the strings "CPU", "CUDA", "ROCM", or "TPU". It is an error to compile + this op for a platform that does not appear in the list. This check can be + disabled using `disabled_checks`. If the list contains more than one platform, then the `module` takes one additional 0-dimensional integer-tensor parameter in the first position, encoding the index in - `platforms` of the current compilation platform. + `platforms` of the current compilation platform. This parameter has value 0 + if the plaform is not among `platforms` and the check has been disabled. 
+ The list can be empty in old versions (earlier than 6) to denote that no + platform checking must be performed at loading time. dim_args_spec: in presence of dynamic shapes, this is the specification for the dimension arguments. In absence of dynamic shapes this list is empty. The `module` takes one 0-dimensional integer tensor dimension argument for each @@ -1386,11 +1391,26 @@ dim_args_spec: in presence of dynamic shapes, this is the specification for the string of the form "." that specifies that the value of the corresponding dimension argument must be "args[arg_idx].shape[axis_idx]", where "args" are the actual array arguments. + This attribute is not used anymore in modules serialized with version 5 + after March 28th, 2023 and JAX OSS versions higher than 0.4.6. + TODO(b/283439649): remove support for dim_args_spec. function_list: This list contains the TensorFlow FunctionDefs that are used by the XLACallModule. If the XLACallModule contains `stablehlo.custom_call` operations, they can call TensorFlow graph functions outside of the XLACallModule. This `function_list` attribute registers the dependency of the XLACallModule on those functions. This attribute was added in version 5. +has_token_input_output: If true, the embedded StableHLO module's main function + must take a `!stablehlo.token` as its first argument and returns a token as + its first result. This can be used in conjunction with the TF2XLA's side + effect mechanism in order to model side effects. +disabled_checks: A list of strings describing the safety checks that were + disabled at serialization time. This attribute was added in version 6. The + following directives are recognized: "platform" (allow a compilation platform + that is not among the `platforms`); "custom_call:xxx" (allow a custom call + with target function name "xxx" even if it is not known to JAX to be stable). 
+ This list, supplemented with a comma-separate list of directives specified + using the flag --tf_xla_call_module_disabled_checks, + is used at module loading time to skip the corresponding checks. )doc"); } // namespace diff --git a/tensorflow/compiler/tf2xla/python/BUILD b/tensorflow/compiler/tf2xla/python/BUILD index 815fc42b44a..c5e7ca1fbf4 100644 --- a/tensorflow/compiler/tf2xla/python/BUILD +++ b/tensorflow/compiler/tf2xla/python/BUILD @@ -33,5 +33,6 @@ tf_custom_op_py_library( deps = [ "//tensorflow/compiler/tf2xla/ops:gen_xla_ops", "//tensorflow/compiler/xla:xla_data_proto_py", + "//tensorflow/python/ops/numpy_ops:np_utils", ], ) diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index 61d2be76ac1..620535d40c2 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -604,12 +604,54 @@ def custom_call_v2( ) -def call_module(args, *, version=4, module, Tout, Sout, - dim_args_spec=(), platforms=(), function_list=()): - # See documentation for the XlaCallModule op. - return gen_xla_ops.xla_call_module( - args, version=version, module=module, dim_args_spec=dim_args_spec, - Tout=Tout, Sout=Sout, platforms=platforms, function_list=function_list) +# pylint: disable=g-doc-args +# pylint: disable=g-doc-return-or-yield +def call_module( + args, + *, + version=4, + module, + Tout, + Sout, + platforms=(), + function_list=(), + has_token_input_output=False, + disabled_checks=(), +): + """See documentation for the XlaCallModule op. 
+ + https://github.com/search?q=repo%3Atensorflow%2Ftensorflow+path%3Axla_ops.cc+xlacallmodule&type=code + """ + res = gen_xla_ops.xla_call_module( + args, + version=version, + module=module, + dim_args_spec=(), + Tout=Tout, + Sout=Sout, + platforms=platforms, + function_list=function_list, + has_token_input_output=has_token_input_output, + disabled_checks=disabled_checks, + ) + # Since XLACallModule op is stateful, zero return function will return the TF + # op under tf.function. It creates trouble for downstream codes. + # Here we force it return empty tuple to work around it. + # TODO(johnqiangzhang): Figure out a better way to handle control dependency. + if isinstance(res, ops.Operation): + res = () + return res +# pylint: enable=g-doc-args +# pylint: enable=g-doc-return-or-yield + + +def call_module_maximum_supported_version(): + return 6 + + +def call_module_disable_check_platform(): + # For use with xla_call_module.disabled_checks. + return "platform" def gather(operand, diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc index 1c24cffa93d..a30c29259d9 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.cc @@ -16,29 +16,65 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h" #include +#include #include #include "tensorflow/compiler/xla/cpu_function_runtime.h" +#include "tensorflow/compiler/xla/runtime/aot_ffi_execution_context.h" namespace tensorflow { +namespace { +// MemrefDesc's are part of the XLA Runtime ABI. Redefine them here (with a +// slightly different name to avoid confusion) because we cannot depend on +// XLA Runtime's headers. +// Note: this is an internal type, to be used exclusively in this file. 
+struct MemrefHolder { + MemrefHolder(const XlaCompiledCpuFunction::ShapeInfo& shape_info, + void* data_ptr) + : rank(shape_info.num_dimensions), data(data_ptr), offset(0) { + sizes.resize(shape_info.num_dimensions); + strides.resize(shape_info.num_dimensions); + int64_t multiplier = 1; + for (int i = shape_info.num_dimensions - 1; i >= 0; --i) { + int64_t size = shape_info.dimensions[i]; + sizes[i] = size; + strides[i] = multiplier; + multiplier *= size; + } + } + + unsigned rank = 0; + // Note: dtype is not needed here. + void* data = nullptr; + int64_t offset = 0; + std::vector sizes; + std::vector strides; +}; +} // namespace + XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, AllocMode alloc_mode) : raw_function_(static_data.raw_function_), - run_function_(static_data.run_function_), + external_run_function_(static_data.external_run_function_), cpu_executable_(static_data.cpu_executable_), result_index_(static_data.result_index_), buffer_table_(new void*[static_data.num_buffers_]), buffer_infos_(static_data.buffer_infos_), num_buffers_(static_data.num_buffers_), + num_results_(static_data.num_results_), + result_index_table_(static_data.result_index_table_), arg_index_table_(static_data.arg_index_table_), num_args_(static_data.num_args_), num_variables_(static_data.num_variables_), + arg_shape_infos_(static_data.arg_shape_infos_), + result_shape_infos_(static_data.result_shape_infos_), arg_names_(static_data.arg_names_), variable_names_(static_data.variable_names_), result_names_(static_data.result_names_), program_shape_(static_data.program_shape_), - hlo_profile_printer_data_(static_data.hlo_profile_printer_data_) { + hlo_profile_printer_data_(static_data.hlo_profile_printer_data_), + use_xla_runtime_(static_data.use_xla_runtime_) { bool allocate_entry_params = alloc_mode == AllocMode::ARGS_VARIABLES_RESULTS_PROFILES_AND_TEMPS; // Allocate arg and temp buffers. 
@@ -56,11 +92,75 @@ XlaCompiledCpuFunction::XlaCompiledCpuFunction(const StaticData& static_data, } } +bool XlaCompiledCpuFunction::RunXlaRuntime() { + size_t num_memref_args = num_args_ + num_results_; + std::vector memref_args; + memref_args.reserve(num_memref_args); + + size_t num_ptrs = 1; // execution context. + + // Append arguments. + for (int i = 0; i < num_args_; ++i) { + const ShapeInfo& shape_info = arg_shape_infos_[i]; + memref_args.emplace_back(shape_info, buffer_table_[arg_index_table_[i]]); + num_ptrs += 3 + 2 * shape_info.num_dimensions; + } + + // Append results. + for (int i = 0; i < num_results_; ++i) { + const ShapeInfo& shape_info = result_shape_infos_[i]; + memref_args.emplace_back(shape_info, buffer_table_[result_index_table_[i]]); + num_ptrs += 3 + 2 * shape_info.num_dimensions; + + // Point to this result from the "result" entry in the buffer table. + void** results = static_cast(buffer_table_[result_index_]); + results[i] = buffer_table_[result_index_table_[i]]; + } + + std::vector call_frame; + call_frame.resize(num_ptrs); + size_t ptr_index = 1; + for (const MemrefHolder& memref : memref_args) { + auto cast = [](const void* p) { return const_cast(p); }; + call_frame[ptr_index + 0] = cast(&memref.data); // memref.basePtr + call_frame[ptr_index + 1] = cast(&memref.data); // memref.data + call_frame[ptr_index + 2] = cast(&memref.offset); + unsigned rank = memref.rank; + for (int64_t d = 0; d < rank; ++d) { + call_frame[ptr_index + 3 + d] = cast(&memref.sizes[d]); + call_frame[ptr_index + 3 + d + rank] = cast(&memref.strides[d]); + } + ptr_index += 3 + 2 * rank; + } + + assert(num_ptrs == ptr_index); + + xla::runtime::aot::ExecutionContext execution_context; + execution_context.custom_call_data = &run_options_; + xla::runtime::aot::ExecutionContext* execution_context_ptr = + &execution_context; + call_frame[0] = &execution_context_ptr; + + auto xla_runtime_func = + reinterpret_cast(raw_function_); + xla_runtime_func(call_frame.data()); + if 
(execution_context.error) { + // No error support in XLA; dump error message to stderr. + std::cerr << "XLA AOT error: " << execution_context.error << ".\n"; + return false; + } + return true; +} + bool XlaCompiledCpuFunction::Run() { - if (run_function_) { + if (use_xla_runtime_) { + return RunXlaRuntime(); + } + if (external_run_function_) { std::vector descriptor_table = MakeXlaRuntimeDescriptorTable(); - return run_function_(cpu_executable_, descriptor_table, &run_options_); + return external_run_function_(cpu_executable_, descriptor_table, + &run_options_); } XlaCustomCallStatus status; raw_function_(buffer_table_[result_index_], &run_options_, nullptr, diff --git a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h index 176f203e924..bde21d559c5 100644 --- a/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h +++ b/tensorflow/compiler/tf2xla/xla_compiled_cpu_function.h @@ -55,16 +55,30 @@ namespace tensorflow { // is guaranteed that no thread may call a non-const method. class XlaCompiledCpuFunction { public: - // Type of the raw function, produced by either JIT or AOT. + // Type of the raw XLA Classic function, produced by either JIT or AOT. using RawFunction = void (*)(void* result, const xla::ExecutableRunOptions* run_options, const void** args, void** temps, XlaCustomCallStatus*, int64_t* profile_counters); - using RunFunction = + + // Signature of the XLA Runtime raw function. Used only by XLA Runtime AOT. + using XlaRuntimeRawFunction = void (*)(void**); + + // Signature of an external run function. Used only by XLA Runtime JIT. + using ExternalRunFunction = bool (*)(const xla::cpu::CpuExecutable* cpu_executable, const std::vector& descriptor_table, const xla::ExecutableRunOptions* run_options); + // Simple struct to describe a tensor's shape. + // Note: this is a poor man's substitute for xla::ShapeProto, but we cannot + // depend on protobuf's in this library. 
+ // TODO(ecg): extend ShapeInfo to support tuples, if needed. + struct ShapeInfo { + const int32_t* dimensions = nullptr; + int32_t num_dimensions = 0; + }; + // StaticData represents the state necessary to run an XLA-compiled // function. For JIT this is backed by data in XlaJitCompiledCpuFunction; for // AOT this is backed by data compiled into the object file. @@ -76,13 +90,20 @@ class XlaCompiledCpuFunction { // The raw function to call. RawFunction raw_function_; - RunFunction run_function_ = nullptr; + ExternalRunFunction external_run_function_ = nullptr; const xla::cpu::CpuExecutable* cpu_executable_ = nullptr; // Contains information about the buffers used by the XLA computation. const xla::cpu_function_runtime::BufferInfo* buffer_infos_ = nullptr; int32_t num_buffers_ = 0; + // Result parameter i is described by + // buffer_infos[result_index_table[i]]. + const int32* result_index_table_ = nullptr; + + // There are num_results result parameters. + int64_t num_results_ = 0; + // Entry parameter i is described by // buffer_infos[arg_index_table[i]]. const int32* arg_index_table_ = nullptr; @@ -96,6 +117,9 @@ class XlaCompiledCpuFunction { // The 0-based index of the result tuple, in the temp buffers. size_t result_index_ = 0; + const ShapeInfo* arg_shape_infos_ = nullptr; + const ShapeInfo* result_shape_infos_ = nullptr; + // [Optional] Arrays of arg and result names. These are arrays of C-style // strings, where the array is terminated by nullptr. const char** arg_names_ = nullptr; @@ -115,6 +139,8 @@ class XlaCompiledCpuFunction { // declared so we don't have access to that information here. int64_t profile_counters_size_ = 0; + bool use_xla_runtime_ = false; + // Only XlaCompiledCpuFunction is allowed to read and write the above // fields. 
friend class XlaCompiledCpuFunction; @@ -166,6 +192,8 @@ class XlaCompiledCpuFunction { return buffer_table_[arg_index_table_[index]]; } + int num_results() const { return num_results_; } + int num_args() const { return num_args_; } int num_variables() const { return num_variables_; } @@ -291,9 +319,9 @@ class XlaCompiledCpuFunction { static_data->raw_function_ = raw_function; } - static void set_static_data_run_function(StaticData* static_data, - RunFunction run_function) { - static_data->run_function_ = run_function; + static void set_static_data_external_run_function( + StaticData* static_data, ExternalRunFunction external_run_function) { + static_data->external_run_function_ = external_run_function; } static void set_static_data_cpu_executable( @@ -312,6 +340,16 @@ class XlaCompiledCpuFunction { static_data->num_buffers_ = num_buffers; } + static void set_static_data_result_index_table( + StaticData* static_data, const int32* result_index_table) { + static_data->result_index_table_ = result_index_table; + } + + static void set_static_data_num_results(StaticData* static_data, + int64_t num_results) { + static_data->num_results_ = num_results; + } + static void set_static_data_arg_index_table(StaticData* static_data, const int32* arg_index_table) { static_data->arg_index_table_ = arg_index_table; @@ -332,6 +370,16 @@ class XlaCompiledCpuFunction { static_data->result_index_ = result_index; } + static void set_static_data_arg_shape_infos(StaticData* static_data, + const ShapeInfo* shape_infos) { + static_data->arg_shape_infos_ = shape_infos; + } + + static void set_static_data_result_shape_infos(StaticData* static_data, + const ShapeInfo* shape_infos) { + static_data->result_shape_infos_ = shape_infos; + } + static void set_static_data_arg_names(StaticData* static_data, const char** arg_names) { static_data->arg_names_ = arg_names; @@ -368,14 +416,19 @@ class XlaCompiledCpuFunction { static_data->profile_counters_size_ = profile_counters_size; } + static void 
set_static_data_use_xla_runtime(StaticData* static_data, + bool use_xla_runtime) { + static_data->use_xla_runtime_ = use_xla_runtime; + } + private: const RawFunction raw_function_; - // TODO(ecg): RunFunction and CpuExecutable should go away. Instead, we should - // have a pointer or reference to a minimal wrapper around CpuExecutable's - // Execute(), without CpuExecutable's dependences. We could call this wrapper - // "XlaRuntimeRunner". - const RunFunction run_function_; + + // [Optional] External Run() function. + const ExternalRunFunction external_run_function_; + // [Maybe Optional] CpuExecutable to be passed to external_run_function_. const xla::cpu::CpuExecutable* cpu_executable_; + const size_t result_index_; // Array containing pointers to argument and temp buffers (slots corresponding @@ -386,6 +439,10 @@ class XlaCompiledCpuFunction { const xla::cpu_function_runtime::BufferInfo* const buffer_infos_; const int32 num_buffers_; + // Indices of expanded result tuple. + const int32 num_results_; + const int32* const result_index_table_; + // Argument i needs to be placed in buffer_table_[arg_index_to_temp_index_[i]] // for XLA generated code to be able to find it. const int32* const arg_index_table_; @@ -396,6 +453,12 @@ class XlaCompiledCpuFunction { // The number of incoming variables. const int32 num_variables_; + // Shapes of the input arguments. + const ShapeInfo* const arg_shape_infos_; + + // Shapes of the results. + const ShapeInfo* const result_shape_infos_; + // Backing memory for buffer_table_ and args_, the latter depending on // AllocMode. void* alloc_buffer_table_ = nullptr; @@ -413,9 +476,13 @@ class XlaCompiledCpuFunction { const xla::ProgramShapeProto* program_shape_ = nullptr; const xla::HloProfilePrinterData* hlo_profile_printer_data_ = nullptr; + const bool use_xla_runtime_ = false; + // Creates a descriptor table for XLA Runtime. 
std::vector MakeXlaRuntimeDescriptorTable(); + bool RunXlaRuntime(); + // Add `XlaJitCompiledCpuFunction` as a friend so that it can access the // `set_static_data_*` static methods above. friend class XlaJitCompiledCpuFunction; diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index ef7c45f0a4b..cc951fe375e 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -16,14 +16,20 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include +#include #include #include +#include +#include #include +#include +#include #include #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" #include "absl/container/flat_hash_map.h" #include "absl/memory/memory.h" +#include "absl/status/status.h" #include "absl/types/variant.h" #include "tensorflow/compiler/jit/defs.h" #include "tensorflow/compiler/jit/flags.h" @@ -572,7 +578,7 @@ Status XlaCompiler::FindFunctionBody(const NameAttrList& function, // function in flib_runtime_. 
auto status = GetFunctionBody(function, local_flib_runtime_, fbody); if (!status.ok()) { - if (!errors::IsNotFound(status)) { + if (!absl::IsNotFound(status)) { return status; } TF_RETURN_WITH_CONTEXT_IF_ERROR( @@ -750,8 +756,8 @@ Status XlaCompiler::CompileSingleOp( auto compile_with_old_bridge = [&]() { *result = {}; - return CompileGraph(compile_options, node_def.name(), std::move(graph), - args, result); + return ADD_SOURCE_LOCATION(CompileGraph(compile_options, node_def.name(), + std::move(graph), args, result)); }; const ConfigProto* config = &(single_op_compile_argument.config_proto); @@ -1426,6 +1432,11 @@ Status XlaCompiler::CompileGraph( std::unique_ptr graph, absl::Span args, CompilationResult* result) { VLOG(1) << "Executing graph symbolically to populate XlaBuilder.: " << name; + if (VLOG_IS_ON(2)) { + VLOG(2) << "XlaCompiler::CompileGraph: " + << DumpGraphToFile(absl::StrCat("xla_compile_graph_", name), *graph, + flib_runtime_->GetFunctionLibraryDefinition()); + } DummyStackTrace stack_trace; for (auto node : graph->nodes()) { @@ -1443,12 +1454,6 @@ Status XlaCompiler::CompileGraph( graph.get(), local_flib_def_.get(), pflr_->GetFunctionLibraryDefinition())); - if (VLOG_IS_ON(2)) { - VLOG(2) << "XlaCompiler::CompileGraph: " - << DumpGraphToFile(absl::StrCat("xla_compile_graph_", name), *graph, - flib_runtime_->GetFunctionLibraryDefinition()); - } - // Report the error here if initialization failed. TF_RETURN_IF_ERROR(initialization_status_); @@ -1456,8 +1461,8 @@ Status XlaCompiler::CompileGraph( // FunctionalizeControlFlow may remove some nodes from the graph. 
TF_RETURN_IF_ERROR(ValidateGraph(graph.get(), *options_.flib_def, options_.device_type, name)); - xla::XlaBuilder builder(name); - XlaContext* context = new XlaContext(this, &builder, graph.get()); + auto builder = std::make_unique(name); + XlaContext* context = new XlaContext(this, builder.get(), graph.get()); core::ScopedUnref context_unref(context); std::vector real_args(args.begin(), args.end()); @@ -1479,7 +1484,7 @@ Status XlaCompiler::CompileGraph( std::vector arg_expressions; TF_RETURN_IF_ERROR(BuildArguments( - *graph, real_args, options.use_tuple_arg, &builder, context, + *graph, real_args, options.use_tuple_arg, builder.get(), context, arg_shardings, &arg_expressions, &result->input_mapping, &result->xla_input_shapes, options.is_entry_computation)); context->set_args(std::move(arg_expressions)); @@ -1505,7 +1510,7 @@ Status XlaCompiler::CompileGraph( // Original token is manually created. if (HasSideEffectingNodes(*graph)) { TF_RETURN_IF_ERROR( - SetNodeToken(kXlaTokenArgNodeName, xla::CreateToken(&builder))); + SetNodeToken(kXlaTokenArgNodeName, xla::CreateToken(builder.get()))); } } @@ -1523,7 +1528,8 @@ Status XlaCompiler::CompileGraph( TF_RETURN_IF_ERROR(token_or.status()); token_inputs.push_back(token_or.value()); } - token_output.reset(new xla::XlaOp(xla::AfterAll(&builder, token_inputs))); + token_output = std::make_unique( + xla::AfterAll(builder.get(), token_inputs)); } TF_RETURN_IF_ERROR(PopNodeTokenMapping()); @@ -1532,7 +1538,8 @@ Status XlaCompiler::CompileGraph( result->computation = std::make_shared(); result->outputs.resize(context->retvals().size()); std::vector retvals = context->retvals(); - ConvertConstantsToExpressions(&builder, absl::Span(retvals)); + ConvertConstantsToExpressions(builder.get(), + absl::Span(retvals)); XlaShapeLayoutHelpers::ShapeDeterminationFns shape_determination_fns{ UseNoPreferenceLayoutFn(), IdentityShapeRepresentationFn()}; TF_RETURN_IF_ERROR(BuildComputation( @@ -1543,7 +1550,7 @@ Status 
XlaCompiler::CompileGraph( options.is_entry_computation, options.return_updated_values_for_all_resources, options.always_return_tuple, options.use_tuple_arg, - options.alias_resource_update, &builder, result->computation.get(), + options.alias_resource_update, builder.get(), result->computation.get(), &num_computation_outputs, &num_nonconst_outputs, &result->outputs, &result->resource_updates, &result->xla_output_shape, result->input_mapping)); diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc index 5c54551707b..a8cca7befd4 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function.cc @@ -140,7 +140,8 @@ XlaJitCompiledCpuFunction::Compile( // Compute buffer infos and the result index, needed to run the raw function. std::vector buffer_infos = - xla::cpu::CreateBufferInfosFromBufferAssignment(buffer_assignment); + xla::cpu::CreateBufferInfosFromBufferAssignment(cpu_executable->module(), + buffer_assignment); std::vector arg_index_table = xla::cpu::CreateArgIndexTableFromBufferInfos(buffer_infos); TF_ASSIGN_OR_RETURN(size_t result_index, @@ -157,8 +158,8 @@ XlaJitCompiledCpuFunction::Compile( XlaCompiledCpuFunction::set_static_data_raw_function(&jit->static_data_, raw_function); if (cpu_executable->IsXlaRuntime()) { - XlaCompiledCpuFunction::set_static_data_run_function(&jit->static_data_, - RunXlaRuntime); + XlaCompiledCpuFunction::set_static_data_external_run_function( + &jit->static_data_, RunXlaRuntime); XlaCompiledCpuFunction::set_static_data_cpu_executable(&jit->static_data_, cpu_executable); } diff --git a/tensorflow/compiler/xla/BUILD b/tensorflow/compiler/xla/BUILD index f29391811ed..7fc7ba9f111 100644 --- a/tensorflow/compiler/xla/BUILD +++ b/tensorflow/compiler/xla/BUILD @@ -121,8 +121,6 @@ xla_cc_test( deps = [ ":bit_cast", ":test", - "//tensorflow/tsl/platform:logging", - 
"//tensorflow/tsl/platform:status", "//tensorflow/tsl/platform:test_main", ], ) @@ -219,7 +217,6 @@ xla_cc_test( deps = [ ":test", ":types", - "//tensorflow/tsl/platform:logging", "//tensorflow/tsl/platform:test_main", ], ) @@ -231,7 +228,6 @@ cc_library( visibility = [":friends"], deps = [ ":status", - ":xla_cc_grpc_proto", ":xla_data_proto_cc", ":xla_proto_cc", ], @@ -263,7 +259,6 @@ xla_cc_test( ":test", ":test_helpers", "//tensorflow/tsl/platform:errors", - "//tensorflow/tsl/platform:logging", "//tensorflow/tsl/platform:test_main", ], ) @@ -377,7 +372,6 @@ cc_library( ":types", ":util", "//tensorflow/tsl/platform:env", - "//tensorflow/tsl/platform:logging", "//tensorflow/tsl/platform:path", "//tensorflow/tsl/platform:protobuf", "@com_google_absl//absl/hash", @@ -500,7 +494,6 @@ xla_cc_test( ":types", ":util", ":xla_data_proto_cc", - "//tensorflow/tsl/platform:logging", "//tensorflow/tsl/platform:test_main", ], ) @@ -563,6 +556,7 @@ cc_library( "//tensorflow/tsl/platform:protobuf", "//tensorflow/tsl/platform:status", "//tensorflow/tsl/util:byte_swap_array", + "//third_party/eigen3", "@com_google_absl//absl/base", "@com_google_absl//absl/functional:function_ref", "@com_google_absl//absl/strings", @@ -585,7 +579,6 @@ xla_cc_test( ":types", "//tensorflow/tsl/lib/core:status_test_util", "//tensorflow/tsl/platform:float8", - "//tensorflow/tsl/platform:test", "//tensorflow/tsl/platform:test_benchmark", "//tensorflow/tsl/platform:test_main", "@com_google_absl//absl/base", @@ -653,7 +646,6 @@ cc_library( visibility = ["//visibility:public"], deps = [ "//tensorflow/tsl/platform:logging", - "//tensorflow/tsl/platform:status", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", @@ -679,7 +671,6 @@ cc_library( deps = [ ":status", ":types", - "//tensorflow/tsl/platform:logging", "@com_google_absl//absl/functional:function_ref", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", 
@@ -880,8 +871,6 @@ xla_cc_test( ":types", "//tensorflow/tsl/lib/core:status_test_util", "//tensorflow/tsl/platform:env", - "//tensorflow/tsl/platform:logging", - "//tensorflow/tsl/platform:test", "//tensorflow/tsl/platform:test_main", ], ) @@ -912,10 +901,8 @@ xla_cc_test( ":shape_util", ":test", ":xla_data_proto_cc", - "//tensorflow/tsl/platform:test", "//tensorflow/tsl/platform:test_benchmark", "//tensorflow/tsl/platform:test_main", - "@com_google_absl//absl/memory", ], ) @@ -997,12 +984,10 @@ xla_cc_test( ":literal", ":reference_util", ":test", - ":util", ":xla_data_proto_cc", "//tensorflow/compiler/xla/client:padding", "//tensorflow/compiler/xla/tests:literal_test_util", "//tensorflow/tsl/platform:test_main", - "@com_google_absl//absl/memory", ], ) @@ -1081,12 +1066,9 @@ xla_cc_test( deps = [ ":xla_proto_cc", - "//tensorflow/compiler/xla/hlo/ir:hlo", - "//tensorflow/tsl/platform:logging", "//tensorflow/tsl/platform:test", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/array.h b/tensorflow/compiler/xla/array.h index 3238ffdf53d..55be3df5890 100644 --- a/tensorflow/compiler/xla/array.h +++ b/tensorflow/compiler/xla/array.h @@ -38,6 +38,10 @@ namespace xla { namespace array_impl { +template +using overload_for_float = std::enable_if_t< + is_specialized_floating_point_v && std::is_same::value, bool>; + // A type trait that is valid when all elements in a parameter pack are of // integral type. Not using an alias template to work around MSVC 14.00 bug. template @@ -110,12 +114,7 @@ class Array { // Creates a 1D array of a floating-point type (half, bfloat16, float, // or double) from an initializer list of float values. 
- template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same::value>::type> + template = true> Array(std::initializer_list values) : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; @@ -128,14 +127,7 @@ class Array { // Creates a 2D array of a floating-point type (float8, half, bfloat16, float, // or double) from an initializer list of float values. - template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same::value>::type> + template = true> Array(std::initializer_list> values) : Array(ToInt64Array(values), no_default_init_t{}) { int64_t idx = 0; @@ -166,12 +158,7 @@ class Array { // Creates a 3D array of a floating-point type (half, bfloat16, float, // or double) from an initializer list of float values. - template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same::value>::type> + template = true> Array(std::initializer_list>> values) : Array(ToInt64Array(values), no_default_init_t{}) { @@ -207,12 +194,7 @@ class Array { // Creates a 4D array of a floating-point type (half, bfloat16, float, // or double) from an initializer list of float values. - template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same::value>::type> + template = true> Array(std::initializer_list< std::initializer_list>>> values) @@ -510,6 +492,17 @@ class Array { // Performs a permutation of dimensions. 
void TransposeDimensions(absl::Span permutation) { + return TransposeDimensionsImpl(permutation); + } + void TransposeDimensions(absl::Span permutation) { + return TransposeDimensionsImpl(permutation); + } + void TransposeDimensions(std::initializer_list permutation) { + return TransposeDimensionsImpl(permutation); + } + template >* = nullptr> + void TransposeDimensionsImpl(absl::Span permutation) { CHECK_EQ(sizes_.size, permutation.size()); OwnedBuffer permuted_dims(permutation.size()); for (int64_t i = 0; i < permutation.size(); ++i) { diff --git a/tensorflow/compiler/xla/array2d.h b/tensorflow/compiler/xla/array2d.h index 2409fe6268b..834d602956b 100644 --- a/tensorflow/compiler/xla/array2d.h +++ b/tensorflow/compiler/xla/array2d.h @@ -52,15 +52,7 @@ class Array2D : public Array { // Creates an array of a floating-point type (float8, half, bfloat16, float, // or double) from the given nested initializer list of float values. - template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same::value>::type> + template = true> Array2D(std::initializer_list> values) : Array(values) {} diff --git a/tensorflow/compiler/xla/array3d.h b/tensorflow/compiler/xla/array3d.h index 5dec480fdad..2fb39461dd9 100644 --- a/tensorflow/compiler/xla/array3d.h +++ b/tensorflow/compiler/xla/array3d.h @@ -57,12 +57,7 @@ class Array3D : public Array { // Creates an array of a floating-point type (half, bfloat16, float, // or double) from the given nested initializer list of float values. 
- template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same::value>::type> + template = true> Array3D( std::initializer_list>> values) diff --git a/tensorflow/compiler/xla/array4d.h b/tensorflow/compiler/xla/array4d.h index 3e75023fd9c..e86f212cd2e 100644 --- a/tensorflow/compiler/xla/array4d.h +++ b/tensorflow/compiler/xla/array4d.h @@ -82,12 +82,7 @@ class Array4D : public Array { // Creates an array of a floating-point type (half, bfloat16, float, // or double) from the given nested initializer list of float values. - template ::value || - std::is_same::value || - std::is_same::value || - std::is_same::value) && - std::is_same::value>::type> + template = true> Array4D(std::initializer_list>>> values) diff --git a/tensorflow/compiler/xla/backends/interpreter/BUILD b/tensorflow/compiler/xla/backends/interpreter/BUILD index f431901a4e8..4017caaf013 100644 --- a/tensorflow/compiler/xla/backends/interpreter/BUILD +++ b/tensorflow/compiler/xla/backends/interpreter/BUILD @@ -56,6 +56,7 @@ cc_library( "//tensorflow/compiler/xla/service:map_inliner", "//tensorflow/compiler/xla/service:qr_expander", "//tensorflow/compiler/xla/service:reshape_mover", + "//tensorflow/compiler/xla/service:topk_rewriter", "//tensorflow/compiler/xla/service:triangular_solve_expander", "//tensorflow/compiler/xla/service:while_loop_simplifier", "//tensorflow/compiler/xla/stream_executor", diff --git a/tensorflow/compiler/xla/backends/interpreter/compiler.cc b/tensorflow/compiler/xla/backends/interpreter/compiler.cc index f614c6be078..d62b46f9051 100644 --- a/tensorflow/compiler/xla/backends/interpreter/compiler.cc +++ b/tensorflow/compiler/xla/backends/interpreter/compiler.cc @@ -38,6 +38,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/service/map_inliner.h" #include "tensorflow/compiler/xla/service/qr_expander.h" #include "tensorflow/compiler/xla/service/reshape_mover.h" +#include "tensorflow/compiler/xla/service/topk_rewriter.h" #include "tensorflow/compiler/xla/service/triangular_solve_expander.h" #include "tensorflow/compiler/xla/service/while_loop_simplifier.h" #include "tensorflow/compiler/xla/status_macros.h" @@ -81,6 +82,7 @@ StatusOr HandleEvaluatorCustomCall( Status InterpreterCompiler::RunHloOptimization(HloModule* hlo_module) { HloPassPipeline pipeline("Interpreter"); + pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); pipeline.AddPass(); diff --git a/tensorflow/compiler/xla/backends/profiler/BUILD b/tensorflow/compiler/xla/backends/profiler/BUILD index c02ea2c3d0c..5327adc4175 100644 --- a/tensorflow/compiler/xla/backends/profiler/BUILD +++ b/tensorflow/compiler/xla/backends/profiler/BUILD @@ -1,4 +1,4 @@ -load("//tensorflow/tsl:tsl.bzl", "if_libtpu", "tsl_gpu_library") +load("//tensorflow/tsl:tsl.bzl", "if_with_tpu_support", "tsl_gpu_library") # copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) @@ -20,8 +20,10 @@ tsl_gpu_library( deps = [ "//tensorflow/compiler/xla/backends/profiler/cpu:host_tracer", "//tensorflow/compiler/xla/backends/profiler/cpu:metadata_collector", - ] + if_libtpu([ - "//tensorflow/compiler/xla/backends/profiler/tpu:tpu_tracer", - ]), + ] + if_with_tpu_support( + [ + "//tensorflow/compiler/xla/backends/profiler/tpu:tpu_tracer", + ], + ), alwayslink = True, ) diff --git a/tensorflow/compiler/xla/backends/profiler/gpu/BUILD b/tensorflow/compiler/xla/backends/profiler/gpu/BUILD index 5ae87af9e0a..a44c5735b5b 100644 --- a/tensorflow/compiler/xla/backends/profiler/gpu/BUILD +++ b/tensorflow/compiler/xla/backends/profiler/gpu/BUILD @@ -25,11 +25,11 @@ load( ) package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = 
["//tensorflow/compiler/xla:internal"], features = [ "-layering_check", ], - licenses = ["notice"], ) tsl_gpu_library( diff --git a/tensorflow/compiler/xla/client/BUILD b/tensorflow/compiler/xla/client/BUILD index bea85c5179d..a29c8df9d8b 100644 --- a/tensorflow/compiler/xla/client/BUILD +++ b/tensorflow/compiler/xla/client/BUILD @@ -125,7 +125,6 @@ cc_library( ":xla_computation", "//tensorflow/compiler/xla:executable_run_options", "//tensorflow/compiler/xla:shape_tree", - "//tensorflow/compiler/xla:status_macros", "//tensorflow/compiler/xla:statusor", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/service:backend", @@ -142,7 +141,6 @@ cc_library( "//tensorflow/compiler/xla/stream_executor:device_memory_allocator", "//tensorflow/compiler/xla/stream_executor/host:host_platform", "@com_google_absl//absl/types:span", - "@llvm-project//llvm:TargetParser", ], ) @@ -296,7 +294,8 @@ xla_cc_test( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/hlo/ir:hlo", - "//tensorflow/compiler/xla/hlo/utils:hlo_matchers", + "//tensorflow/compiler/xla/service:pattern_matcher", + "//tensorflow/compiler/xla/service:pattern_matcher_gmock", "//tensorflow/compiler/xla/tests:xla_internal_test_main", "//tensorflow/tsl/platform:test", ], diff --git a/tensorflow/compiler/xla/client/client.cc b/tensorflow/compiler/xla/client/client.cc index bfd0e75d143..8f000fb3d8b 100644 --- a/tensorflow/compiler/xla/client/client.cc +++ b/tensorflow/compiler/xla/client/client.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/client/xla_computation.h" diff --git a/tensorflow/compiler/xla/client/client.h b/tensorflow/compiler/xla/client/client.h index afcc953a9f2..fb9b1c19be1 100644 --- a/tensorflow/compiler/xla/client/client.h +++ b/tensorflow/compiler/xla/client/client.h @@ -17,6 +17,8 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_XLA_CLIENT_CLIENT_H_ #include +#include +#include #include #include "absl/types/span.h" diff --git a/tensorflow/compiler/xla/client/client_library.cc b/tensorflow/compiler/xla/client/client_library.cc index 034ce6e927a..1868c59bc3c 100644 --- a/tensorflow/compiler/xla/client/client_library.cc +++ b/tensorflow/compiler/xla/client/client_library.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/client_library.h" #include +#include +#include #include #include "tensorflow/compiler/xla/service/backend.h" diff --git a/tensorflow/compiler/xla/client/client_library.h b/tensorflow/compiler/xla/client/client_library.h index d86328320b2..af599f8eef8 100644 --- a/tensorflow/compiler/xla/client/client_library.h +++ b/tensorflow/compiler/xla/client/client_library.h @@ -85,7 +85,7 @@ class ClientLibrary { // created, for the given platform. static StatusOr GetOrCreateLocalClient( se::Platform* platform = nullptr, - const std::optional>& allowed_devices = std::nullopt); + const std::optional>& device_set = std::nullopt); static StatusOr GetOrCreateLocalClient( const LocalClientOptions& options); diff --git a/tensorflow/compiler/xla/client/compile_only_client.cc b/tensorflow/compiler/xla/client/compile_only_client.cc index 62d2057b5fd..2cea9024bf5 100644 --- a/tensorflow/compiler/xla/client/compile_only_client.cc +++ b/tensorflow/compiler/xla/client/compile_only_client.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/compile_only_client.h" #include +#include #include "llvm/ADT/Twine.h" #include "llvm/TargetParser/Triple.h" diff --git a/tensorflow/compiler/xla/client/compile_only_client.h b/tensorflow/compiler/xla/client/compile_only_client.h index 02524eaeb2a..30766ec2dd0 100644 --- a/tensorflow/compiler/xla/client/compile_only_client.h +++ b/tensorflow/compiler/xla/client/compile_only_client.h @@ -16,6 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_COMPILE_ONLY_CLIENT_H_ +#include +#include + #include "tensorflow/compiler/xla/client/client.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/compile_only_service.h" @@ -52,7 +55,7 @@ class CompileOnlyClient : public Client { // code. |metadata|, if provided, is populated during compilation. StatusOr>> CompileAheadOfTime( - const absl::Span computations, + absl::Span computations, const AotCompilationOptions& options, std::unique_ptr* metadata = nullptr); diff --git a/tensorflow/compiler/xla/client/executable_build_options.cc b/tensorflow/compiler/xla/client/executable_build_options.cc index 30259c323a2..37b21a7d18e 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.cc +++ b/tensorflow/compiler/xla/client/executable_build_options.cc @@ -16,7 +16,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/executable_build_options.h" #include +#include #include +#include #include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/debug_options_flags.h" diff --git a/tensorflow/compiler/xla/client/executable_build_options.h b/tensorflow/compiler/xla/client/executable_build_options.h index ed4554e9c77..b859589cbf3 100644 --- a/tensorflow/compiler/xla/client/executable_build_options.h +++ b/tensorflow/compiler/xla/client/executable_build_options.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include #include @@ -138,6 +139,7 @@ class ExecutableBuildOptions { CHECK(device_assignment_.has_value()); return device_assignment_.value(); } + void clear_device_assignment() { device_assignment_.reset(); } // Whether input and output buffers are aliased if the associated parameter is // passed-through XLA modules without being changed. 
diff --git a/tensorflow/compiler/xla/client/global_data.cc b/tensorflow/compiler/xla/client/global_data.cc index 6785be501d5..49aea0c8566 100644 --- a/tensorflow/compiler/xla/client/global_data.cc +++ b/tensorflow/compiler/xla/client/global_data.cc @@ -15,8 +15,10 @@ limitations under the License. #include "tensorflow/compiler/xla/client/global_data.h" +#include #include #include +#include #include "absl/container/flat_hash_map.h" #include "tensorflow/compiler/xla/types.h" diff --git a/tensorflow/compiler/xla/client/lib/BUILD b/tensorflow/compiler/xla/client/lib/BUILD index 9d46f68b6fb..bc7ba74322a 100644 --- a/tensorflow/compiler/xla/client/lib/BUILD +++ b/tensorflow/compiler/xla/client/lib/BUILD @@ -103,6 +103,7 @@ cc_library( "//tensorflow/compiler/xla:util", "//tensorflow/compiler/xla:xla_data_proto_cc", "//tensorflow/compiler/xla/client:xla_builder", + "//tensorflow/tsl/platform:float8", ], ) diff --git a/tensorflow/compiler/xla/client/lib/approx_topk.cc b/tensorflow/compiler/xla/client/lib/approx_topk.cc index aafbda35c34..10cf46aa95d 100644 --- a/tensorflow/compiler/xla/client/lib/approx_topk.cc +++ b/tensorflow/compiler/xla/client/lib/approx_topk.cc @@ -17,6 +17,8 @@ limitations under the License. #include #include +#include +#include #include "absl/strings/str_format.h" #include "tensorflow/compiler/xla/client/lib/approx_topk_shape.h" diff --git a/tensorflow/compiler/xla/client/lib/approx_topk_shape.cc b/tensorflow/compiler/xla/client/lib/approx_topk_shape.cc index 2d5db586ea0..d0b1164a065 100644 --- a/tensorflow/compiler/xla/client/lib/approx_topk_shape.cc +++ b/tensorflow/compiler/xla/client/lib/approx_topk_shape.cc @@ -16,6 +16,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/approx_topk_shape.h" #include +#include +#include #include "tensorflow/compiler/xla/util.h" diff --git a/tensorflow/compiler/xla/client/lib/approx_topk_shape.h b/tensorflow/compiler/xla/client/lib/approx_topk_shape.h index 027acc9defe..d1e2cb62fd5 100644 --- a/tensorflow/compiler/xla/client/lib/approx_topk_shape.h +++ b/tensorflow/compiler/xla/client/lib/approx_topk_shape.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_APPROX_TOPK_SHAPE_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_APPROX_TOPK_SHAPE_H_ +#include + #include "tensorflow/compiler/xla/statusor.h" namespace xla { diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.cc b/tensorflow/compiler/xla/client/lib/arithmetic.cc index 058e6b301dc..f4cd43a2127 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic.cc @@ -15,7 +15,10 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include +#include #include +#include #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/client/lib/constants.h" diff --git a/tensorflow/compiler/xla/client/lib/arithmetic.h b/tensorflow/compiler/xla/client/lib/arithmetic.h index cdaa4f63b0a..449d7acb516 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic.h +++ b/tensorflow/compiler/xla/client/lib/arithmetic.h @@ -16,7 +16,9 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_ARITHMETIC_H_ +#include #include +#include #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/client/xla_computation.h" diff --git a/tensorflow/compiler/xla/client/lib/arithmetic_test.cc b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc index cf20b7c6549..f55aa3db0ee 100644 --- a/tensorflow/compiler/xla/client/lib/arithmetic_test.cc +++ b/tensorflow/compiler/xla/client/lib/arithmetic_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/arithmetic.h" +#include #include #include "tensorflow/compiler/xla/client/xla_builder.h" diff --git a/tensorflow/compiler/xla/client/lib/comparators.cc b/tensorflow/compiler/xla/client/lib/comparators.cc index 5e628545bad..19403b287de 100644 --- a/tensorflow/compiler/xla/client/lib/comparators.cc +++ b/tensorflow/compiler/xla/client/lib/comparators.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/comparators.h" #include +#include #include #include diff --git a/tensorflow/compiler/xla/client/lib/comparators.h b/tensorflow/compiler/xla/client/lib/comparators.h index 33a6a2a2ad0..81d71afa384 100644 --- a/tensorflow/compiler/xla/client/lib/comparators.h +++ b/tensorflow/compiler/xla/client/lib/comparators.h @@ -16,6 +16,8 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_COMPARATORS_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_COMPARATORS_H_ +#include +#include #include #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -49,7 +51,7 @@ XlaComputation CreateScalarComparisonComputation( const std::string& name, const std::vector& operand_types, const std::vector< std::optional)>>& - comparators, + generators, XlaBuilder* builder); } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/constants.cc b/tensorflow/compiler/xla/client/lib/constants.cc index aa48c54905e..0752bc99e24 100644 --- a/tensorflow/compiler/xla/client/lib/constants.cc +++ b/tensorflow/compiler/xla/client/lib/constants.cc @@ -15,8 +15,13 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/constants.h" +#include + #include "tensorflow/compiler/xla/literal_util.h" +#include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/util.h" +#include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/tsl/platform/float8.h" namespace xla { @@ -41,24 +46,19 @@ XlaOp One(XlaBuilder* builder, PrimitiveType type) { } XlaOp Epsilon(XlaBuilder* builder, PrimitiveType type) { - switch (type) { - case F16: - return ConstantR0( - builder, - static_cast(Eigen::NumTraits::epsilon())); - case BF16: - return ConstantR0( - builder, static_cast( - Eigen::NumTraits::epsilon())); - case F32: - return ConstantR0(builder, std::numeric_limits::epsilon()); - case F64: - return ConstantR0(builder, - std::numeric_limits::epsilon()); - default: - return builder->ReportError(InvalidArgument( - "Invalid type for Epsilon (%s).", PrimitiveType_Name(type))); - } + return primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> XlaOp { + if constexpr (primitive_util::IsFloatingPointType( + primitive_type_constant)) { + using NativeT = typename primitive_util::PrimitiveTypeToNative< + primitive_type_constant>::type; + return ConstantR0(builder, + 
std::numeric_limits::epsilon()); + } + return builder->ReportError(InvalidArgument( + "Invalid type for Epsilon (%s).", PrimitiveType_Name(type))); + }, + type); } XlaOp MinValue(XlaBuilder* builder, PrimitiveType type) { @@ -66,39 +66,35 @@ XlaOp MinValue(XlaBuilder* builder, PrimitiveType type) { } XlaOp MinFiniteValue(XlaBuilder* builder, PrimitiveType type) { - switch (type) { - case F16: - return ConstantR0(builder, - Eigen::NumTraits::lowest()); - case BF16: - return ConstantR0( - builder, Eigen::NumTraits::lowest()); - case F32: - return ConstantR0(builder, -std::numeric_limits::max()); - case F64: - return ConstantR0(builder, -std::numeric_limits::max()); - default: - return MinValue(builder, type); - } + return primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> XlaOp { + if constexpr (primitive_util::IsFloatingPointType( + primitive_type_constant)) { + using NativeT = typename primitive_util::PrimitiveTypeToNative< + primitive_type_constant>::type; + return ConstantR0(builder, + std::numeric_limits::lowest()); + } + return MinValue(builder, type); + }, + type); } XlaOp MinPositiveNormalValue(XlaBuilder* builder, PrimitiveType type) { - switch (type) { - case F16: - return ConstantR0(builder, - std::numeric_limits::min()); - case BF16: - return ConstantR0( - builder, std::numeric_limits::min()); - case F32: - return ConstantR0(builder, std::numeric_limits::min()); - case F64: - return ConstantR0(builder, std::numeric_limits::min()); - default: - return builder->ReportError( - InvalidArgument("Invalid type for MinPositiveNormalValue (%s).", - PrimitiveType_Name(type))); - } + return primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> XlaOp { + if constexpr (primitive_util::IsFloatingPointType( + primitive_type_constant)) { + using NativeT = typename primitive_util::PrimitiveTypeToNative< + primitive_type_constant>::type; + return ConstantR0(builder, + std::numeric_limits::min()); + } + return 
builder->ReportError( + InvalidArgument("Invalid type for MinPositiveNormalValue (%s).", + PrimitiveType_Name(type))); + }, + type); } XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type) { @@ -106,44 +102,34 @@ XlaOp MaxValue(XlaBuilder* builder, PrimitiveType type) { } XlaOp MaxFiniteValue(XlaBuilder* builder, PrimitiveType type) { - switch (type) { - case F16: - return ConstantR0(builder, - Eigen::NumTraits::highest()); - case BF16: - return ConstantR0( - builder, Eigen::NumTraits::highest()); - case F32: - return ConstantR0(builder, std::numeric_limits::max()); - case F64: - return ConstantR0(builder, std::numeric_limits::max()); - default: - return MaxValue(builder, type); - } + return primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> XlaOp { + if constexpr (primitive_util::IsFloatingPointType( + primitive_type_constant)) { + using NativeT = typename primitive_util::PrimitiveTypeToNative< + primitive_type_constant>::type; + return ConstantR0(builder, + std::numeric_limits::max()); + } + return MaxValue(builder, type); + }, + type); } XlaOp NanValue(XlaBuilder* builder, PrimitiveType type) { - return builder->ReportErrorOrReturn([&]() -> StatusOr { - switch (type) { - case F16: - return ConstantR0( - builder, Eigen::NumTraits::quiet_NaN()); - case BF16: - return ConstantR0( - builder, Eigen::NumTraits::quiet_NaN()); - case F32: - return ConstantR0(builder, - std::numeric_limits::quiet_NaN()); - case F64: - return ConstantR0(builder, - std::numeric_limits::quiet_NaN()); - default: - return InvalidArgument( - "Operand to NanValue was %s, but must be a real-valued " - "floating-point type.", - PrimitiveType_Name(type)); - } - }); + return primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> XlaOp { + if constexpr (primitive_util::IsFloatingPointType( + primitive_type_constant)) { + using NativeT = typename primitive_util::PrimitiveTypeToNative< + primitive_type_constant>::type; + return ConstantR0(builder, + 
std::numeric_limits::quiet_NaN()); + } + return builder->ReportError(InvalidArgument( + "Invalid type for NanValue (%s).", PrimitiveType_Name(type))); + }, + type); } } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/constants.h b/tensorflow/compiler/xla/client/lib/constants.h index 779c40eee48..9fc69f45836 100644 --- a/tensorflow/compiler/xla/client/lib/constants.h +++ b/tensorflow/compiler/xla/client/lib/constants.h @@ -22,6 +22,7 @@ limitations under the License. #include "tensorflow/compiler/xla/primitive_util.h" #include "tensorflow/compiler/xla/types.h" #include "tensorflow/compiler/xla/xla_data.pb.h" +#include "tensorflow/tsl/platform/float8.h" namespace xla { @@ -45,42 +46,17 @@ XlaOp ConstantR0WithType(XlaBuilder* builder, PrimitiveType type, T value) { "Invalid cast from complex type to %s in ConstantR0WithType.", PrimitiveType_Name(type))); } - switch (type) { - case PRED: - return ConstantR0(builder, static_cast(value)); - case F16: - return ConstantR0(builder, static_cast(value)); - case BF16: - return ConstantR0(builder, static_cast(value)); - case F32: - return ConstantR0(builder, static_cast(value)); - case F64: - return ConstantR0(builder, static_cast(value)); - case C64: - return ConstantR0(builder, static_cast(value)); - case C128: - return ConstantR0(builder, static_cast(value)); - case U8: - return ConstantR0(builder, static_cast(value)); - case U16: - return ConstantR0(builder, static_cast(value)); - case U32: - return ConstantR0(builder, static_cast(value)); - case U64: - return ConstantR0(builder, static_cast(value)); - case S8: - return ConstantR0(builder, static_cast(value)); - case S16: - return ConstantR0(builder, static_cast(value)); - case S32: - return ConstantR0(builder, static_cast(value)); - case S64: - return ConstantR0(builder, static_cast(value)); - default: - return builder->ReportError( - InvalidArgument("Invalid type for ConstantR0WithType (%s).", - PrimitiveType_Name(type))); - } + return 
primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> XlaOp { + if constexpr (primitive_util::IsArrayType(primitive_type_constant)) { + using NativeT = primitive_util::NativeTypeOf; + return ConstantR0(builder, static_cast(value)); + } + return builder->ReportError( + InvalidArgument("Invalid type for ConstantR0WithType (%s).", + PrimitiveType_Name(type))); + }, + type); } // Returns a scalar containing 'value' cast to the same run-time type as diff --git a/tensorflow/compiler/xla/client/lib/constants_test.cc b/tensorflow/compiler/xla/client/lib/constants_test.cc index 5b034dde320..051cfc898da 100644 --- a/tensorflow/compiler/xla/client/lib/constants_test.cc +++ b/tensorflow/compiler/xla/client/lib/constants_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/constants.h" +#include + #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" diff --git a/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc index 3dde6cdcafe..8230f848df5 100644 --- a/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc +++ b/tensorflow/compiler/xla/client/lib/conv_grad_size_util.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/conv_grad_size_util.h" +#include + #include "tensorflow/compiler/xla/status_macros.h" #include "tensorflow/tsl/platform/errors.h" diff --git a/tensorflow/compiler/xla/client/lib/logdet.cc b/tensorflow/compiler/xla/client/lib/logdet.cc index 3201323f4dc..b77694f0cbe 100644 --- a/tensorflow/compiler/xla/client/lib/logdet.cc +++ b/tensorflow/compiler/xla/client/lib/logdet.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/logdet.h" +#include #include #include diff --git a/tensorflow/compiler/xla/client/lib/logdet_test.cc b/tensorflow/compiler/xla/client/lib/logdet_test.cc index b5f78aea82d..ac61cbfad27 100644 --- a/tensorflow/compiler/xla/client/lib/logdet_test.cc +++ b/tensorflow/compiler/xla/client/lib/logdet_test.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/logdet.h" +#include + #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" #include "tensorflow/compiler/xla/client/lib/matrix.h" diff --git a/tensorflow/compiler/xla/client/lib/loops.cc b/tensorflow/compiler/xla/client/lib/loops.cc index 7e7426812ee..4da691f3a9d 100644 --- a/tensorflow/compiler/xla/client/lib/loops.cc +++ b/tensorflow/compiler/xla/client/lib/loops.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/loops.h" +#include +#include +#include + #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/shape_util.h" diff --git a/tensorflow/compiler/xla/client/lib/lu_decomposition.cc b/tensorflow/compiler/xla/client/lib/lu_decomposition.cc index dac74300215..7a52980e599 100644 --- a/tensorflow/compiler/xla/client/lib/lu_decomposition.cc +++ b/tensorflow/compiler/xla/client/lib/lu_decomposition.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/lu_decomposition.h" +#include #include #include "tensorflow/compiler/xla/shape_util.h" diff --git a/tensorflow/compiler/xla/client/lib/math.cc b/tensorflow/compiler/xla/client/lib/math.cc index 25179617548..95fd4621d0b 100644 --- a/tensorflow/compiler/xla/client/lib/math.cc +++ b/tensorflow/compiler/xla/client/lib/math.cc @@ -15,7 +15,12 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/math.h" +#include +#include #include +#include +#include +#include #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" @@ -153,6 +158,9 @@ XlaOp IsNegZero(XlaOp operand) { case F32: return Eq(BitcastConvertType(operand, U32), ConstantR0WithType(&b, U32, uint32_t{1} << 31)); + case F8E5M2: + case F8E4M3FN: + case F8E4M3B11FNUZ: case F16: case BF16: // Not all XLA backends handle U16 well, so we convert to F32/U32. @@ -293,10 +301,11 @@ XlaOp Erfc(XlaOp x) { } // Erf(c)Impl don't have enough precision when run with bf16 intermediates // (not surprising!), so upcast to f32 in this case. - return DoWithUpcastToF32(x, {BF16, F16}, [](XlaOp x) { - return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl32(x), - ScalarLike(x, 1) - ErfImpl32Cephes(x)); - }); + return DoWithUpcastToF32( + x, {BF16, F16, F8E5M2, F8E4M3FN, F8E4M3B11FNUZ}, [](XlaOp x) { + return Select(Gt(Abs(x), ScalarLike(x, 1)), ErfcImpl32(x), + ScalarLike(x, 1) - ErfImpl32Cephes(x)); + }); }); } @@ -338,7 +347,7 @@ XlaOp Erf(XlaOp x) { } // Erf(c)Impl don't have enough precision when run with bf16 intermediates // (not surprising!), so upcast to f32 in this case. - return DoWithUpcastToF32(x, {BF16, F16}, + return DoWithUpcastToF32(x, {BF16, F16, F8E5M2, F8E4M3FN, F8E4M3B11FNUZ}, [](XlaOp x) { return ErfImpl32(x); }); }); } @@ -487,7 +496,7 @@ XlaOp ErfInv(XlaOp x) { if (shape.element_type() == F64) { return ErfInv64(x); } - return DoWithUpcastToF32(x, {BF16, F16}, + return DoWithUpcastToF32(x, {BF16, F16, F8E5M2, F8E4M3FN, F8E4M3B11FNUZ}, [](XlaOp x) { return ErfInv32(x); }); }); } @@ -616,7 +625,8 @@ XlaOp Lgamma(XlaOp input) { // F16 and BF16 don't provide sufficient precision for intermediate results // here (although it's better than you might expect!), so do the // computations in F32. 
- return DoWithUpcastToF32(input, {BF16, F16}, do_it); + return DoWithUpcastToF32( + input, {BF16, F16, F8E5M2, F8E4M3FN, F8E4M3B11FNUZ}, do_it); }); } @@ -711,7 +721,8 @@ XlaOp Digamma(XlaOp input) { auto& b = *input.builder(); return b.ReportErrorOrReturn([&]() -> StatusOr { TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Digamma", input)); - return DoWithUpcastToF32(input, {BF16, F16}, do_it); + return DoWithUpcastToF32( + input, {BF16, F16, F8E5M2, F8E4M3FN, F8E4M3B11FNUZ}, do_it); }); } @@ -965,8 +976,13 @@ XlaOp Igamma(XlaOp a, XlaOp x) { } TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("Igamma", a)); PrimitiveType a_x_type = a_shape.element_type(); - bool needs_upcast = - a_shape.element_type() == F16 || a_shape.element_type() == BF16; + bool needs_upcast = false; + for (PrimitiveType type : {BF16, F16, F8E5M2, F8E4M3FN, F8E4M3B11FNUZ}) { + if (a_shape.element_type() == type) { + needs_upcast = true; + break; + } + } if (needs_upcast) { a = ConvertElementType(a, F32); @@ -1012,8 +1028,13 @@ XlaOp IgammaGradA(XlaOp a, XlaOp x) { a_shape.ToString(), x_shape.ToString()); } TF_RETURN_IF_ERROR(EnsureOperandIsRealFp("IgammaGradA", a)); - bool needs_upcast = - a_shape.element_type() == F16 || a_shape.element_type() == BF16; + bool needs_upcast = false; + for (PrimitiveType type : {BF16, F16, F8E5M2, F8E4M3FN, F8E4M3B11FNUZ}) { + if (a_shape.element_type() == type) { + needs_upcast = true; + break; + } + } if (needs_upcast) { a = ConvertElementType(a, F32); diff --git a/tensorflow/compiler/xla/client/lib/math_test.cc b/tensorflow/compiler/xla/client/lib/math_test.cc index ccd4ee2b1cc..cd571a66978 100644 --- a/tensorflow/compiler/xla/client/lib/math_test.cc +++ b/tensorflow/compiler/xla/client/lib/math_test.cc @@ -15,7 +15,14 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/math.h" +#include +#include +#include #include +#include +#include +#include +#include #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -120,10 +127,6 @@ class MathTypedTest : public MathTest { // // For good measure, we also check pow with an exponent other than 0.5. void TestSqrtPowInequivalence() { - // TODO(b/145798892): test fails on GPU for double values. - if (std::is_same::value) { - return; - } SetFastMathDisabled(true); // Tests disable constant folding by default, but this test needs it @@ -222,10 +225,6 @@ XLA_TEST_F(MathTest, RealFpOnlyOps) { } else { continue; } - if (ty == F8E5M2 || ty == F8E4M3FN || ty == F8E4M3B11FNUZ) { - // TODO(b/259609697): Add FP8 support to math ops - continue; - } for (const auto& test : std::vector, std::string>>({ diff --git a/tensorflow/compiler/xla/client/lib/matrix.cc b/tensorflow/compiler/xla/client/lib/matrix.cc index f8eb44de9aa..eb4a8a5e0b5 100644 --- a/tensorflow/compiler/xla/client/lib/matrix.cc +++ b/tensorflow/compiler/xla/client/lib/matrix.cc @@ -18,6 +18,7 @@ limitations under the License. #include #include #include +#include #include #include #include diff --git a/tensorflow/compiler/xla/client/lib/matrix.h b/tensorflow/compiler/xla/client/lib/matrix.h index 5ceda40af02..b24feca3ea8 100644 --- a/tensorflow/compiler/xla/client/lib/matrix.h +++ b/tensorflow/compiler/xla/client/lib/matrix.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include #include "absl/strings/string_view.h" diff --git a/tensorflow/compiler/xla/client/lib/pooling.cc b/tensorflow/compiler/xla/client/lib/pooling.cc index 42340910fd5..7db9f364d3a 100644 --- a/tensorflow/compiler/xla/client/lib/pooling.cc +++ b/tensorflow/compiler/xla/client/lib/pooling.cc @@ -15,6 +15,10 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/pooling.h" +#include +#include +#include + #include "tensorflow/compiler/xla/client/lib/arithmetic.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/conv_grad_size_util.h" diff --git a/tensorflow/compiler/xla/client/lib/pooling.h b/tensorflow/compiler/xla/client/lib/pooling.h index 9510193f8a6..3a26c02d0d5 100644 --- a/tensorflow/compiler/xla/client/lib/pooling.h +++ b/tensorflow/compiler/xla/client/lib/pooling.h @@ -16,6 +16,9 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_POOLING_H_ +#include +#include + #include "absl/container/inlined_vector.h" #include "tensorflow/compiler/xla/client/xla_builder.h" @@ -57,8 +60,7 @@ XlaOp MaxPool(XlaOp operand, absl::Span kernel_size, XlaOp AvgPool(XlaOp operand, absl::Span kernel_size, absl::Span stride, absl::Span> padding, - const TensorFormat& data_format, - const bool counts_include_padding); + const TensorFormat& data_format, bool counts_include_padding); // Returns the list of low and high padding elements in each spatial dimension // for the given 'padding' specification. @@ -72,8 +74,7 @@ XlaOp AvgPoolGrad(XlaOp out_backprop, absl::Span gradients_size, absl::Span kernel_size, absl::Span stride, absl::Span> spatial_padding, - const TensorFormat& data_format, - const bool counts_include_padding); + const TensorFormat& data_format, bool counts_include_padding); } // namespace xla diff --git a/tensorflow/compiler/xla/client/lib/pooling_test.cc b/tensorflow/compiler/xla/client/lib/pooling_test.cc index 44d0091f0c9..496a9a931e1 100644 --- a/tensorflow/compiler/xla/client/lib/pooling_test.cc +++ b/tensorflow/compiler/xla/client/lib/pooling_test.cc @@ -14,6 +14,10 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/xla/client/lib/pooling.h" + +#include +#include + #include "absl/container/inlined_vector.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/tests/client_library_test_base.h" diff --git a/tensorflow/compiler/xla/client/lib/prng.cc b/tensorflow/compiler/xla/client/lib/prng.cc index b0b66dd1b0a..4c6d0eae79d 100644 --- a/tensorflow/compiler/xla/client/lib/prng.cc +++ b/tensorflow/compiler/xla/client/lib/prng.cc @@ -15,7 +15,11 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/prng.h" +#include #include +#include +#include +#include #include #include "tensorflow/compiler/xla/client/lib/constants.h" @@ -597,57 +601,59 @@ XlaOp PhiloxIncreaseCounter(XlaOp counter, XlaOp delta) { RngOutput ThreeFryBitGenerator(XlaOp key, XlaOp initial_state, const Shape& shape) { PrimitiveType type = shape.element_type(); - switch (type) { - case S8: - case U8: - case F16: - case U16: - case S16: - return ThreeFryRngBitNarrow(key, initial_state, shape); - case F32: - case U32: - case S32: - return ThreeFryRngBit32(key, initial_state, shape); - case F64: - case U64: - case S64: - return ThreeFryRngBit64(key, initial_state, shape); - default: - return { - key.builder()->ReportError(Unimplemented( - "Types other than F16, F32, F64, U16, S16, U32, S32, U64 and S64 " - "are not implemented by ThreeFryBitGenerator; got %s", - primitive_util::LowercasePrimitiveTypeName(type))), - initial_state}; - } + return primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> RngOutput { + if constexpr (primitive_util::IsArrayType(primitive_type_constant) && + !primitive_util::IsComplexType(primitive_type_constant) && + primitive_type_constant != PRED) { + const int kBits = primitive_util::BitWidth(primitive_type_constant); + if (kBits < 32) { + return ThreeFryRngBitNarrow(key, initial_state, shape); + } + if (kBits == 32) 
{ + return ThreeFryRngBit32(key, initial_state, shape); + } + if (kBits == 64) { + return ThreeFryRngBit64(key, initial_state, shape); + } + } + return { + key.builder()->ReportError(Unimplemented( + "Types other than F16, F32, F64, U16, S16, U32, S32, U64 and " + "S64 are not implemented by ThreeFryBitGenerator; got %s", + primitive_util::LowercasePrimitiveTypeName(type))), + initial_state}; + }, + type); } RngOutput PhiloxBitGenerator(XlaOp key, XlaOp initial_state, const Shape& shape) { PrimitiveType type = shape.element_type(); - switch (type) { - case S8: - case U8: - case F16: - case U16: - case S16: - return PhiloxRngBitNarrow(key, initial_state, shape); - case F32: - case U32: - case S32: - return PhiloxRngBit32(key, initial_state, shape); - case F64: - case U64: - case S64: - return PhiloxRngBit64(key, initial_state, shape); - default: - return { - key.builder()->ReportError(Unimplemented( - "Types other than F16, F32, F64, U16, S16, U32, S32, U64 and S64 " - "are not implemented by PhiloxBitGenerator; got %s", - primitive_util::LowercasePrimitiveTypeName(type))), - initial_state}; - } + return primitive_util::PrimitiveTypeSwitch( + [&](auto primitive_type_constant) -> RngOutput { + if constexpr (primitive_util::IsArrayType(primitive_type_constant) && + !primitive_util::IsComplexType(primitive_type_constant) && + primitive_type_constant != PRED) { + const int kBits = primitive_util::BitWidth(primitive_type_constant); + if (kBits < 32) { + return PhiloxRngBitNarrow(key, initial_state, shape); + } + if (kBits == 32) { + return PhiloxRngBit32(key, initial_state, shape); + } + if (kBits == 64) { + return PhiloxRngBit64(key, initial_state, shape); + } + } + return { + key.builder()->ReportError(Unimplemented( + "Types other than F16, F32, F64, U16, S16, U32, S32, U64 and " + "S64 are not implemented by PhiloxBitGenerator; got %s", + primitive_util::LowercasePrimitiveTypeName(type))), + initial_state}; + }, + type); } std::pair ScramblePhiloxKey(XlaOp key) { 
diff --git a/tensorflow/compiler/xla/client/lib/prng.h b/tensorflow/compiler/xla/client/lib/prng.h index ef60bd74486..35d6d05ac33 100644 --- a/tensorflow/compiler/xla/client/lib/prng.h +++ b/tensorflow/compiler/xla/client/lib/prng.h @@ -17,6 +17,8 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_CLIENT_LIB_PRNG_H_ #include +#include +#include #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/xla_data.pb.h" diff --git a/tensorflow/compiler/xla/client/lib/qr.cc b/tensorflow/compiler/xla/client/lib/qr.cc index 12ed7af8821..41ed04a7c87 100644 --- a/tensorflow/compiler/xla/client/lib/qr.cc +++ b/tensorflow/compiler/xla/client/lib/qr.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/qr.h" +#include #include #include diff --git a/tensorflow/compiler/xla/client/lib/quantize_test.cc b/tensorflow/compiler/xla/client/lib/quantize_test.cc index 2dbbd21666c..52668d27ac8 100644 --- a/tensorflow/compiler/xla/client/lib/quantize_test.cc +++ b/tensorflow/compiler/xla/client/lib/quantize_test.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/quantize.h" #include +#include #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/test.h" diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc index 26f15fb3203..8caf8c1784e 100644 --- a/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc +++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig.cc @@ -16,6 +16,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" #include +#include #include #include "tensorflow/compiler/xla/client/lib/arithmetic.h" diff --git a/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc b/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc index 0f47c41975c..6e48ae35cfa 100644 --- a/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc +++ b/tensorflow/compiler/xla/client/lib/self_adjoint_eig_test.cc @@ -15,6 +15,10 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/self_adjoint_eig.h" +#include +#include +#include + #include "tensorflow/compiler/xla/array.h" #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" diff --git a/tensorflow/compiler/xla/client/lib/slicing.cc b/tensorflow/compiler/xla/client/lib/slicing.cc index d8a36e22aa9..91c35d2cd4c 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.cc +++ b/tensorflow/compiler/xla/client/lib/slicing.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/slicing.h" #include +#include #include #include diff --git a/tensorflow/compiler/xla/client/lib/slicing.h b/tensorflow/compiler/xla/client/lib/slicing.h index 3befbd311eb..2bf9a27aee5 100644 --- a/tensorflow/compiler/xla/client/lib/slicing.h +++ b/tensorflow/compiler/xla/client/lib/slicing.h @@ -13,6 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include + #include "absl/types/span.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/types.h" diff --git a/tensorflow/compiler/xla/client/lib/sorting.cc b/tensorflow/compiler/xla/client/lib/sorting.cc index cdd1f4a542a..32a59b6025d 100644 --- a/tensorflow/compiler/xla/client/lib/sorting.cc +++ b/tensorflow/compiler/xla/client/lib/sorting.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/sorting.h" +#include + #include "tensorflow/compiler/xla/client/lib/comparators.h" #include "tensorflow/compiler/xla/client/lib/constants.h" #include "tensorflow/compiler/xla/client/lib/loops.h" @@ -188,24 +190,9 @@ XlaOp TopKWithPartitions(XlaOp input, int64_t k, int64_t num_partitions, auto iota = values_and_indices[3]; // Slice value and indices for this partition. - XlaOp start; - switch (index_type) { - case PrimitiveType::S16: - start = Mul(Add(partition, ConstantR0(builder, 1)), - ConstantR0(builder, per_partition_size)); - break; - case PrimitiveType::S32: - start = Mul(Add(partition, ConstantR0(builder, 1)), - ConstantR0(builder, per_partition_size)); - break; - case PrimitiveType::S64: - start = Mul(Add(partition, ConstantR0(builder, 1)), - ConstantR0(builder, per_partition_size)); - break; - default: - LOG(FATAL) << "Unsupported index type " - << PrimitiveType_Name(index_type); - } + XlaOp start = + Mul(Add(partition, One(builder, index_type)), + ConstantR0WithType(builder, index_type, per_partition_size)); XlaOp sliced_input = DynamicSliceInMinorDims(input, {start}, {per_partition_size}); XlaOp sliced_indices = diff --git a/tensorflow/compiler/xla/client/lib/sorting_test.cc b/tensorflow/compiler/xla/client/lib/sorting_test.cc index 7d5de392067..8573329b5ae 100644 --- a/tensorflow/compiler/xla/client/lib/sorting_test.cc +++ b/tensorflow/compiler/xla/client/lib/sorting_test.cc @@ -15,7 +15,11 @@ 
limitations under the License. #include "tensorflow/compiler/xla/client/lib/sorting.h" +#include +#include #include +#include +#include #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/test.h" diff --git a/tensorflow/compiler/xla/client/lib/svd_test.cc b/tensorflow/compiler/xla/client/lib/svd_test.cc index 597c2d7747f..034771d2fb6 100644 --- a/tensorflow/compiler/xla/client/lib/svd_test.cc +++ b/tensorflow/compiler/xla/client/lib/svd_test.cc @@ -15,7 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/svd.h" +#include #include +#include #include "tensorflow/compiler/xla/array2d.h" #include "tensorflow/compiler/xla/array3d.h" diff --git a/tensorflow/compiler/xla/client/lib/testing.cc b/tensorflow/compiler/xla/client/lib/testing.cc index bb13a3b15c3..4f2aeb9438c 100644 --- a/tensorflow/compiler/xla/client/lib/testing.cc +++ b/tensorflow/compiler/xla/client/lib/testing.cc @@ -15,6 +15,9 @@ limitations under the License. #include "tensorflow/compiler/xla/client/lib/testing.h" +#include +#include + #include "absl/strings/str_cat.h" #include "tensorflow/compiler/xla/client/xla_builder.h" #include "tensorflow/compiler/xla/execution_options_util.h" diff --git a/tensorflow/compiler/xla/client/local_client.cc b/tensorflow/compiler/xla/client/local_client.cc index 3418616fcc9..ae82cd46167 100644 --- a/tensorflow/compiler/xla/client/local_client.cc +++ b/tensorflow/compiler/xla/client/local_client.cc @@ -18,15 +18,14 @@ limitations under the License. 
#include #include #include +#include -#include "llvm/TargetParser/Triple.h" #include "tensorflow/compiler/xla/client/xla_computation.h" #include "tensorflow/compiler/xla/service/backend.h" #include "tensorflow/compiler/xla/service/dump.h" #include "tensorflow/compiler/xla/service/service_executable_run_options.h" #include "tensorflow/compiler/xla/service/source_map_util.h" #include "tensorflow/compiler/xla/service/stream_pool.h" -#include "tensorflow/compiler/xla/status_macros.h" using xla::source_map_util::InvalidParameterArgument; @@ -167,8 +166,8 @@ LocalExecutable::RunHelper(const absl::Span argument_shapes, // ExecutableRunOptions.eigen_intra_op_thread_pool. // *) The thread pool used for XLA CPU ops is from // backend_->eigen_intra_op_thread_pool(). - ServiceExecutableRunOptions service_options(run_options, - backend_->StreamBorrower()); + ServiceExecutableRunOptions service_options( + run_options, backend_->StreamBorrowerWithPriority()); return std::make_pair(service_options, std::move(stream)); } diff --git a/tensorflow/compiler/xla/client/local_client.h b/tensorflow/compiler/xla/client/local_client.h index 1e2cbf11c60..c79425bca90 100644 --- a/tensorflow/compiler/xla/client/local_client.h +++ b/tensorflow/compiler/xla/client/local_client.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include #include "absl/types/span.h" @@ -49,7 +50,7 @@ class LocalExecutable { // Run the compiled computation with the given arguments and options and // return the result. StatusOr Run( - const absl::Span arguments, + absl::Span arguments, ExecutableRunOptions run_options); // Similar to Run(), but allows for donating argument buffers to the @@ -60,7 +61,7 @@ class LocalExecutable { // Similar to Run(), but need not block the host waiting for the computation // to complete before returning. 
StatusOr RunAsync( - const absl::Span arguments, + absl::Span arguments, ExecutableRunOptions run_options); // Similar to RunAsync(), but allows for donating argument buffers to the @@ -91,7 +92,7 @@ class LocalExecutable { StatusOr LiteralFromShapedBuffer(const ShapedBuffer& shaped_buffer); StatusOr> RunHelper( - const absl::Span argument_shapes, + absl::Span argument_shapes, ExecutableRunOptions run_options); // The ordinal of the device which this executable was compiled for. The @@ -143,7 +144,7 @@ class LocalClient : public Client { // environment variable. StatusOr>> Compile( const XlaComputation& computation, - const absl::Span argument_layouts, + absl::Span argument_layouts, const ExecutableBuildOptions& options); // Same as Compile() above, but return AotCompilationResult objects (instead @@ -151,7 +152,7 @@ class LocalClient : public Client { // LocalExecutable(s) using the Load() method below. StatusOr>> CompileAheadOfTime(const XlaComputation& computation, - const absl::Span argument_layouts, + absl::Span argument_layouts, const ExecutableBuildOptions& options); // Return a LocalExecutable object loaded from a serialized diff --git a/tensorflow/compiler/xla/client/padding.cc b/tensorflow/compiler/xla/client/padding.cc index 7fec04e2ac5..a78cd490a4c 100644 --- a/tensorflow/compiler/xla/client/padding.cc +++ b/tensorflow/compiler/xla/client/padding.cc @@ -16,6 +16,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/padding.h" #include +#include +#include #include "tensorflow/compiler/xla/util.h" #include "tensorflow/tsl/lib/math/math_util.h" diff --git a/tensorflow/compiler/xla/client/padding_test.cc b/tensorflow/compiler/xla/client/padding_test.cc index 1b249596138..79306a40d2e 100644 --- a/tensorflow/compiler/xla/client/padding_test.cc +++ b/tensorflow/compiler/xla/client/padding_test.cc @@ -15,6 +15,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/client/padding.h" +#include + #include "tensorflow/tsl/platform/test.h" namespace xla { diff --git a/tensorflow/compiler/xla/client/sharding_builder.cc b/tensorflow/compiler/xla/client/sharding_builder.cc index e3290f8afd1..718b411a6a9 100644 --- a/tensorflow/compiler/xla/client/sharding_builder.cc +++ b/tensorflow/compiler/xla/client/sharding_builder.cc @@ -15,6 +15,8 @@ limitations under the License. #include "tensorflow/compiler/xla/client/sharding_builder.h" +#include + namespace xla { namespace sharding_builder { diff --git a/tensorflow/compiler/xla/client/value_inference.cc b/tensorflow/compiler/xla/client/value_inference.cc index 1b211b5cb54..2ffa00e234c 100644 --- a/tensorflow/compiler/xla/client/value_inference.cc +++ b/tensorflow/compiler/xla/client/value_inference.cc @@ -16,6 +16,7 @@ limitations under the License. #include #include +#include #include #include #include @@ -609,8 +610,8 @@ StatusOr PostorderDFSVisitor::AnalyzeConstantValueFallback( TF_ASSIGN_OR_RETURN( auto computation, HloComputation::CreateFromProto(*computation_proto, {})); - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .WithComputation(std::move(computation)) .WithSubshape(context.shape_index) .Evaluate(); @@ -629,8 +630,8 @@ StatusOr PostorderDFSVisitor::AnalyzeConstantValueFallback( .AddVisit([](Literal operand) { return operand; }); } return result.AddVisit([root, this](absl::Span operands) { - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .Evaluate(); }); } @@ -763,8 +764,8 @@ StatusOr PostorderDFSVisitor::AnalyzeUpperBound( std::vector new_operands; new_operands.emplace_back(std::move(upper_bound)); new_operands.emplace_back(std::move(lower_bound)); - return HloProtoEvaluator(evaluator, *root) - .WithOperands(absl::MakeSpan(new_operands)) + return 
std::make_unique(evaluator, *root) + ->WithOperands(absl::MakeSpan(new_operands)) .Evaluate(); }); } @@ -796,8 +797,8 @@ StatusOr PostorderDFSVisitor::AnalyzeUpperBound( .AddDependency(root->operand_ids(1), PostorderDFSNodeType::kConstantValue, context) .AddVisit([root, this](absl::Span operands) { - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .Evaluate(); }); } @@ -874,8 +875,8 @@ StatusOr PostorderDFSVisitor::AnalyzeLowerBound( PostorderDFSNodeType::kConstantUpperBound, context) .AddVisit( [root, this](absl::Span operands) -> StatusOr { - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .Evaluate(); }); } @@ -886,8 +887,8 @@ StatusOr PostorderDFSVisitor::AnalyzeLowerBound( .AddDependency(root->operand_ids(1), PostorderDFSNodeType::kConstantValue, context) .AddVisit([root, this](absl::Span operands) { - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .Evaluate(); }); } @@ -939,8 +940,8 @@ StatusOr PostorderDFSVisitor::AnalyzeConstant( } return result.AddVisit( [root, this](absl::Span operands) -> StatusOr { - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .Evaluate(); }); } @@ -984,8 +985,8 @@ StatusOr PostorderDFSVisitor::AnalyzeConstant( TF_ASSIGN_OR_RETURN( auto computation, HloComputation::CreateFromProto(*computation_proto, {})); - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .WithComputation(std::move(computation)) .WithSubshape(context.shape_index) .Evaluate(); @@ -1149,8 +1150,8 @@ StatusOr PostorderDFSVisitor::AnalyzeIsDynamic( case HloOpcode::kShiftRightArithmetic: case 
HloOpcode::kShiftRightLogical: { return result.AddVisit([root, this](absl::Span operands) { - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .WithPrimitiveType(PRED) .WithOpCode(HloOpcode::kOr) .Evaluate(); @@ -1176,8 +1177,8 @@ StatusOr PostorderDFSVisitor::AnalyzeIsDynamic( .AddVisit([](Literal operand) { return operand; }); } return result.AddVisit([root, this](absl::Span operands) { - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .WithPrimitiveType(PRED) .Evaluate(); }); @@ -1341,8 +1342,8 @@ StatusOr PostorderDFSVisitor::AnalyzeIsDynamic( reduce_or = b.Build(); } - return HloProtoEvaluator(evaluator, *root) - .WithOperands(operands) + return std::make_unique(evaluator, *root) + ->WithOperands(operands) .WithPrimitiveType(PRED) .WithComputation(std::move(reduce_or)) // Reduce could produce tuple shape, only fetch what we need. @@ -1429,8 +1430,8 @@ StatusOr PostorderDFSVisitor::AnalyzeIsDynamic( new_operands.emplace_back( optional_selector_literal.GetValue()->Clone()); - return HloProtoEvaluator(evaluator, *root) - .WithOperands(absl::MakeSpan(new_operands)) + return std::make_unique(evaluator, *root) + ->WithOperands(absl::MakeSpan(new_operands)) .WithPrimitiveType(PRED) .Evaluate(); }); @@ -1655,7 +1656,7 @@ StatusOr ValueInference::SimplifyOp(int64_t handle) { TF_ASSIGN_OR_RETURN(auto* inst, builder_->LookUpInstructionByHandle(handle)); TF_ASSIGN_OR_RETURN(HloOpcode opcode, StringToHloOpcode(inst->opcode())); std::vector operands; - auto output_shape = Shape(inst->shape()); + auto output_shape = std::make_unique(inst->shape()); switch (opcode) { case HloOpcode::kSlice: case HloOpcode::kConcatenate: @@ -1667,8 +1668,8 @@ StatusOr ValueInference::SimplifyOp(int64_t handle) { } // We put handles into the tensor and evaluate the results into a literal. 
// The literal also contain handles for each element position. - return HloProtoEvaluator(evaluator_, *inst) - .WithOperands(absl::MakeSpan(operands)) + return std::make_unique(evaluator_, *inst) + ->WithOperands(absl::MakeSpan(operands)) .WithPrimitiveType(S64) .Evaluate(); } @@ -1676,23 +1677,23 @@ StatusOr ValueInference::SimplifyOp(int64_t handle) { // Only identity kConvert can be optimized away. auto operand = builder_->LookUpInstructionByHandle(inst->operand_ids(0)).value(); - if (Shape::Equal()(output_shape, Shape(operand->shape()))) { + if (Shape::Equal()(*output_shape, Shape(operand->shape()))) { // Forward operand handle as result. return SimplifyOp(inst->operand_ids(0)); } else { - return CreateS64Literal(-1, output_shape); + return CreateS64Literal(-1, *output_shape); } } case HloOpcode::kAdd: { // a + (b - a) => b // a + b + (c - a) => b + c - if (output_shape.rank() == 0) { + if (output_shape->rank() == 0) { TF_ASSIGN_OR_RETURN(auto lhs, SimplifyOp(inst->operand_ids(0))); TF_ASSIGN_OR_RETURN(auto rhs, SimplifyOp(inst->operand_ids(1))); int64_t lhs_handle = lhs.Get({}); int64_t rhs_handle = rhs.Get({}); if (lhs_handle == -1 || rhs_handle == -1) { - return CreateS64Literal(-1, output_shape); + return CreateS64Literal(-1, *output_shape); } // Recursive lambda needs explicit signature. 
std::function(int64_t, int64_t)> @@ -1749,14 +1750,14 @@ StatusOr ValueInference::SimplifyOp(int64_t handle) { return LiteralUtil::CreateR0(new_sum.handle()); } else { - return CreateS64Literal(-1, output_shape); + return CreateS64Literal(-1, *output_shape); } } default: { - if (ShapeUtil::IsScalar(output_shape)) { + if (ShapeUtil::IsScalar(*output_shape)) { return LiteralUtil::CreateR0(handle); } else { - return CreateS64Literal(-1, output_shape); + return CreateS64Literal(-1, *output_shape); } } } diff --git a/tensorflow/compiler/xla/client/value_inference.h b/tensorflow/compiler/xla/client/value_inference.h index 2579f65059f..3d371eef283 100644 --- a/tensorflow/compiler/xla/client/value_inference.h +++ b/tensorflow/compiler/xla/client/value_inference.h @@ -16,6 +16,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_XLA_CLIENT_VALUE_INFERENCE_H_ #include +#include #include "absl/container/flat_hash_map.h" #include "tensorflow/compiler/xla/client/xla_builder.h" diff --git a/tensorflow/compiler/xla/client/xla_builder.cc b/tensorflow/compiler/xla/client/xla_builder.cc index 339ce5b2ad8..b913f2c4f00 100644 --- a/tensorflow/compiler/xla/client/xla_builder.cc +++ b/tensorflow/compiler/xla/client/xla_builder.cc @@ -2603,30 +2603,15 @@ XlaOp XlaBuilder::RngBitGenerator(RandomAlgorithm algorithm, TF_RETURN_IF_ERROR(ShapeUtil::ValidateShapeWithOptionalLayout(shape)); TF_ASSIGN_OR_RETURN(Shape state_shape, GetShape(initial_state)); Shape output_shape = shape; - switch (output_shape.element_type()) { - case PrimitiveType::S8: - case PrimitiveType::U8: - output_shape.set_element_type(PrimitiveType::U8); - break; - case PrimitiveType::BF16: - case PrimitiveType::F16: - case PrimitiveType::S16: - case PrimitiveType::U16: - output_shape.set_element_type(PrimitiveType::U16); - break; - case PrimitiveType::F32: - case PrimitiveType::S32: - case PrimitiveType::U32: - output_shape.set_element_type(PrimitiveType::U32); - break; - case PrimitiveType::F64: - case 
PrimitiveType::S64: - case PrimitiveType::U64: - output_shape.set_element_type(PrimitiveType::U64); - break; - default: - return InvalidArgument("Unsupported shape for RngBitGenerator: %s", - PrimitiveType_Name(output_shape.element_type())); + output_shape.set_element_type(PRIMITIVE_TYPE_INVALID); + if (primitive_util::IsArrayType(shape.element_type())) { + output_shape.set_element_type( + primitive_util::UnsignedIntegralTypeForBitWidth( + primitive_util::BitWidth(shape.element_type()))); + } + if (!primitive_util::IsUnsignedIntegralType(output_shape.element_type())) { + return InvalidArgument("Unsupported shape for RngBitGenerator: %s", + PrimitiveType_Name(shape.element_type())); } return RngBitGeneratorInternal( ShapeUtil::MakeTupleShapeWithPtrs({&state_shape, &output_shape}), @@ -4994,6 +4979,18 @@ XlaOp AllReduce(const XlaOp operand, const XlaComputation& computation, use_global_device_ids); } +XlaOp AllReduceTuple(const absl::Span operands, + const XlaComputation& computation, + absl::Span replica_groups, + const std::optional& channel_id, + const std::optional& shape_with_layout, + const std::optional use_global_device_ids) { + CHECK(!operands.empty()); + return operands[0].builder()->AllReduce( + operands[0].builder()->Tuple(operands), computation, replica_groups, + channel_id, shape_with_layout, use_global_device_ids); +} + XlaOp ReduceScatter(const XlaOp operand, const XlaComputation& computation, int64_t scatter_dimension, int64_t shard_count, absl::Span replica_groups, diff --git a/tensorflow/compiler/xla/client/xla_builder.h b/tensorflow/compiler/xla/client/xla_builder.h index bde606e2e35..ad74675704d 100644 --- a/tensorflow/compiler/xla/client/xla_builder.h +++ b/tensorflow/compiler/xla/client/xla_builder.h @@ -67,16 +67,16 @@ struct XlaBuilderFriend { XlaBuilder* builder, absl::Span operands, std::string execution_thread, const XlaComputation& called_computation, const Shape& shape); - static XlaOp BuildAsyncUpdate(XlaBuilder* builder, const XlaOp 
operands, + static XlaOp BuildAsyncUpdate(XlaBuilder* builder, XlaOp operands, std::string execution_thread, int64_t group_id, int64_t called_computation, const Shape& shape); - static XlaOp BuildAsyncUpdate(XlaBuilder* builder, const XlaOp operands, + static XlaOp BuildAsyncUpdate(XlaBuilder* builder, XlaOp operands, std::string execution_thread, int64_t called_computation, const Shape& shape); - static XlaOp BuildAsyncDone(XlaBuilder* builder, const XlaOp operands, + static XlaOp BuildAsyncDone(XlaBuilder* builder, XlaOp operands, std::string execution_thread, int64_t group_id, int64_t called_computation, const Shape& shape); - static XlaOp BuildAsyncDone(XlaBuilder* builder, const XlaOp operands, + static XlaOp BuildAsyncDone(XlaBuilder* builder, XlaOp operands, std::string execution_thread, int64_t called_computation, const Shape& shape); @@ -85,8 +85,8 @@ struct XlaBuilderFriend { int64_t shard_count, absl::Span replica_groups = {}, const std::optional& channel_id = std::nullopt, const std::optional& layout = std::nullopt, - const std::optional use_global_device_ids = std::nullopt); - static XlaOp BuildAllGatherDone(XlaBuilder* builder, const XlaOp operands, + std::optional use_global_device_ids = std::nullopt); + static XlaOp BuildAllGatherDone(XlaBuilder* builder, XlaOp operands, const Shape& shape); static XlaOp BuildAllReduceStart( @@ -94,22 +94,21 @@ struct XlaBuilderFriend { absl::Span replica_groups = {}, const std::optional& channel_id = std::nullopt, const std::optional& layout = std::nullopt, - const std::optional use_global_device_ids = std::nullopt); - static XlaOp BuildAllReduceDone(XlaBuilder* builder, const XlaOp operands, + std::optional use_global_device_ids = std::nullopt); + static XlaOp BuildAllReduceDone(XlaBuilder* builder, XlaOp operands, const Shape& shape); static XlaOp BuildCollectivePermuteStart( XlaBuilder* builder, XlaOp operand, const std::vector>& source_target_pairs, const std::optional& channel_id = std::nullopt); - static 
XlaOp BuildCollectivePermuteDone(XlaBuilder* builder, - const XlaOp operands, + static XlaOp BuildCollectivePermuteDone(XlaBuilder* builder, XlaOp operands, const Shape& shape); static XlaOp BuildCopyStart( XlaBuilder* builder, XlaOp operand, std::optional cross_program_prefetch_index = std::nullopt); - static XlaOp BuildCopyDone(XlaBuilder* builder, const XlaOp operand, + static XlaOp BuildCopyDone(XlaBuilder* builder, XlaOp operand, const Shape& shape); static XlaOp BuildFusion( @@ -135,9 +134,8 @@ struct XlaBuilderFriend { const Shape& shape, const ChannelHandle& handle, bool is_host_transfer); - static XlaOp BuildDomain(XlaBuilder* builder, XlaOp operand, - const OpSharding entry, const OpSharding exit, - const Shape& shape); + static XlaOp BuildDomain(XlaBuilder* builder, XlaOp operand, OpSharding entry, + OpSharding exit, const Shape& shape); static XlaOp BuildRngGetAndUpdateState(XlaBuilder* builder, int64_t delta, const Shape& shape); @@ -521,9 +519,8 @@ class XlaBuilder { XlaOp Broadcast(XlaOp operand, absl::Span broadcast_sizes); - XlaOp BroadcastInDim(XlaOp operand, - const absl::Span out_dim_size, - const absl::Span broadcast_dimensions); + XlaOp BroadcastInDim(XlaOp operand, absl::Span out_dim_size, + absl::Span broadcast_dimensions); XlaOp Pad(XlaOp operand, XlaOp padding_value, const PaddingConfig& padding_config); @@ -810,19 +807,18 @@ class XlaBuilder { XlaOp CrossReplicaSum(XlaOp operand, absl::Span replica_groups = {}); - XlaOp AllGather( - XlaOp operand, int64_t all_gather_dimension, int64_t shard_count, - absl::Span replica_groups = {}, - const std::optional& channel_id = std::nullopt, - const std::optional& layout = std::nullopt, - const std::optional use_global_device_ids = std::nullopt); + XlaOp AllGather(XlaOp operand, int64_t all_gather_dimension, + int64_t shard_count, + absl::Span replica_groups = {}, + const std::optional& channel_id = std::nullopt, + const std::optional& layout = std::nullopt, + std::optional use_global_device_ids = 
std::nullopt); - XlaOp AllReduce( - XlaOp operand, const XlaComputation& computation, - absl::Span replica_groups = {}, - const std::optional& channel_id = std::nullopt, - const std::optional& shape_with_layout = std::nullopt, - const std::optional use_global_device_ids = std::nullopt); + XlaOp AllReduce(XlaOp operand, const XlaComputation& computation, + absl::Span replica_groups = {}, + const std::optional& channel_id = std::nullopt, + const std::optional& shape_with_layout = std::nullopt, + std::optional use_global_device_ids = std::nullopt); XlaOp ReduceScatter( XlaOp operand, const XlaComputation& computation, @@ -830,7 +826,7 @@ class XlaBuilder { absl::Span replica_groups = {}, const std::optional& channel_id = std::nullopt, const std::optional& layout = std::nullopt, - const std::optional use_global_device_ids = std::nullopt); + std::optional use_global_device_ids = std::nullopt); XlaOp AllToAll(XlaOp operand, int64_t split_dimension, int64_t concat_dimension, int64_t split_count, @@ -937,12 +933,11 @@ class XlaBuilder { absl::Span branch_computations, absl::Span branch_operands); - XlaOp ReducePrecision(XlaOp operand, const int exponent_bits, - const int mantissa_bits); + XlaOp ReducePrecision(XlaOp operand, int exponent_bits, int mantissa_bits); virtual StatusOr ReducePrecisionInternal(const Shape& shape, XlaOp operand, - const int exponent_bits, - const int mantissa_bits); + int exponent_bits, + int mantissa_bits); XlaOp Gather(XlaOp input, XlaOp start_indices, const GatherDimensionNumbers& dimension_numbers, @@ -1083,7 +1078,7 @@ class XlaBuilder { // operation such as `RngNormal` or `Infeed`. The visitor walks the // computation starting at a given operation and sets is_constant to false iff // a parameter or stateful operation is encountered. 
- void IsConstantVisitor(const int64_t op_handle, int depth, + void IsConstantVisitor(int64_t op_handle, int depth, absl::flat_hash_set* visited, bool* is_constant) const; @@ -1176,9 +1171,9 @@ class XlaBuilder { friend XlaOp Broadcast(XlaOp operand, absl::Span broadcast_sizes); - friend XlaOp BroadcastInDim( - XlaOp operand, const absl::Span out_dim_size, - const absl::Span broadcast_dimensions); + friend XlaOp BroadcastInDim(XlaOp operand, + absl::Span out_dim_size, + absl::Span broadcast_dimensions); friend XlaOp Copy(XlaOp operand); @@ -1439,18 +1434,24 @@ class XlaBuilder { absl::Span replica_groups, const std::optional& channel_id, const std::optional& layout, - const std::optional use_global_device_ids); + std::optional use_global_device_ids); friend XlaOp AllReduce(XlaOp operand, const XlaComputation& computation, absl::Span replica_groups, const std::optional& channel_id, const std::optional& shape_with_layout, - const std::optional use_global_device_ids); + std::optional use_global_device_ids); + friend XlaOp AllReduceTuple(absl::Span operand, + const XlaComputation& computation, + absl::Span replica_groups, + const std::optional& channel_id, + const std::optional& shape_with_layout, + std::optional use_global_device_ids); friend XlaOp ReduceScatter(XlaOp operand, const XlaComputation& computation, int64_t scatter_dimension, int64_t shard_count, absl::Span replica_groups, const std::optional& channel_id, const std::optional& layout, - const std::optional use_global_device_ids); + std::optional use_global_device_ids); friend XlaOp AllToAll(XlaOp operand, int64_t split_dimension, int64_t concat_dimension, int64_t split_count, @@ -1546,8 +1547,8 @@ class XlaBuilder { XlaOp branch_index, absl::Span branch_computations, absl::Span branch_operands); - friend XlaOp ReducePrecision(XlaOp operand, const int exponent_bits, - const int mantissa_bits); + friend XlaOp ReducePrecision(XlaOp operand, int exponent_bits, + int mantissa_bits); friend XlaOp Gather(XlaOp 
input, XlaOp start_indices, const GatherDimensionNumbers& dimension_numbers, absl::Span slice_sizes, @@ -1604,15 +1605,13 @@ class XlaBuilder { absl::Span replica_groups, const std::optional& channel_id, const std::optional& layout, - const std::optional use_global_device_ids, - bool async); + std::optional use_global_device_ids, bool async); XlaOp AllReduceImpl(XlaOp operand, const XlaComputation& computation, absl::Span replica_groups, const std::optional& channel_id, const std::optional& layout, - const std::optional use_global_device_ids, - bool async); + std::optional use_global_device_ids, bool async); XlaOp CollectivePermuteImpl( XlaOp operand, @@ -1848,9 +1847,8 @@ XlaOp Broadcast(XlaOp operand, absl::Span broadcast_sizes); // will generate output // {{1 , 1}, // {2 , 2}} -XlaOp BroadcastInDim(XlaOp operand, - const absl::Span out_dim_size, - const absl::Span broadcast_dimensions); +XlaOp BroadcastInDim(XlaOp operand, absl::Span out_dim_size, + absl::Span broadcast_dimensions); // Copies the input operand to the output. This operation is for internal // purpose and is only used by the compiler for optimization purposes or to @@ -2428,7 +2426,7 @@ XlaOp AllGather(XlaOp operand, int64_t all_gather_dimension, absl::Span replica_groups = {}, const std::optional& channel_id = std::nullopt, const std::optional& layout = std::nullopt, - const std::optional use_global_device_ids = std::nullopt); + std::optional use_global_device_ids = std::nullopt); // Enqueues an operation that do an AllReduce of the operand cross cores. 
Here // AllReduce means doing a reduction on the input operand cross cores and then @@ -2453,14 +2451,21 @@ XlaOp AllReduce(XlaOp operand, const XlaComputation& computation, absl::Span replica_groups = {}, const std::optional& channel_id = std::nullopt, const std::optional& shape_with_layout = std::nullopt, - const std::optional use_global_device_ids = std::nullopt); + std::optional use_global_device_ids = std::nullopt); + +XlaOp AllReduceTuple( + absl::Span operand, const XlaComputation& computation, + absl::Span replica_groups = {}, + const std::optional& channel_id = std::nullopt, + const std::optional& shape_with_layout = std::nullopt, + std::optional use_global_device_ids = std::nullopt); XlaOp ReduceScatter( XlaOp operand, const XlaComputation& computation, int64_t scatter_dimension, int64_t shard_count, absl::Span replica_groups = {}, const std::optional& channel_id = std::nullopt, const std::optional& layout = std::nullopt, - const std::optional use_global_device_ids = std::nullopt); + std::optional use_global_device_ids = std::nullopt); // Enqueues an operation that do an Alltoall of the operand cross cores. // An optional `layout` can be specified to force the layout of the instruction. @@ -2702,8 +2707,7 @@ XlaOp Conditional(XlaOp branch_index, absl::Span branch_operands); // Enqueues a ReducePrecision node onto the computation. -XlaOp ReducePrecision(XlaOp operand, const int exponent_bits, - const int mantissa_bits); +XlaOp ReducePrecision(XlaOp operand, int exponent_bits, int mantissa_bits); // Enqueues a Gather node onto the computation. XlaOp Gather(XlaOp input, XlaOp start_indices, diff --git a/tensorflow/compiler/xla/client/xla_builder_test.cc b/tensorflow/compiler/xla/client/xla_builder_test.cc index 1b0eb3bc073..97c7dfcfc7b 100644 --- a/tensorflow/compiler/xla/client/xla_builder_test.cc +++ b/tensorflow/compiler/xla/client/xla_builder_test.cc @@ -29,7 +29,8 @@ limitations under the License. 
#include "tensorflow/compiler/xla/hlo/ir/hlo_casting_utils.h" #include "tensorflow/compiler/xla/hlo/ir/hlo_instructions.h" #include "tensorflow/compiler/xla/hlo/ir/hlo_module.h" -#include "tensorflow/compiler/xla/hlo/utils/hlo_matchers.h" +#include "tensorflow/compiler/xla/service/pattern_matcher.h" +#include "tensorflow/compiler/xla/service/pattern_matcher_gmock.h" #include "tensorflow/compiler/xla/shape_util.h" #include "tensorflow/compiler/xla/test.h" #include "tensorflow/compiler/xla/test_helpers.h" @@ -40,7 +41,7 @@ namespace xla { namespace { -namespace op = xla::testing::opcode_matchers; +namespace m = ::xla::match; using ::testing::HasSubstr; @@ -80,58 +81,57 @@ TEST_F(XlaBuilderTest, OnePlusTwo) { Add(ConstantR0(&b, 1.0), ConstantR0(&b, 2.0)); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Constant(), op::Constant())); + EXPECT_THAT(root, GmockMatch(m::Add(m::Constant(), m::Constant()))); } TEST_F(XlaBuilderTest, UnaryOperatorsBuildExpectedHLO) { - auto test_unary_operator = - [&](std::function op, - ::testing::Matcher matches_pattern) { - XlaBuilder b(TestName()); - op(ConstantR0(&b, 1)); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, matches_pattern); - }; - test_unary_operator([](XlaOp x) { return -x; }, op::Negate(op::Constant())); - test_unary_operator([](XlaOp x) { return ~x; }, op::Not(op::Constant())); + auto test_unary_operator = [&](std::function op, + auto matches_pattern) { + XlaBuilder b(TestName()); + op(ConstantR0(&b, 1)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, matches_pattern); + }; + test_unary_operator([](XlaOp x) { return -x; }, + GmockMatch(m::Negate(m::Constant()))); + test_unary_operator([](XlaOp x) { return ~x; }, + 
GmockMatch(m::Not(m::Constant()))); } TEST_F(XlaBuilderTest, BinaryOperatorsBuildExpectedHLO) { - auto test_binary_operator = - [&](std::function op, - ::testing::Matcher matches_pattern) { - XlaBuilder b(TestName()); - op(ConstantR0(&b, 1), ConstantR0(&b, 2)); - TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); - auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, matches_pattern); - }; + auto test_binary_operator = [&](std::function op, + auto matches_pattern) { + XlaBuilder b(TestName()); + op(ConstantR0(&b, 1), ConstantR0(&b, 2)); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + EXPECT_THAT(root, matches_pattern); + }; test_binary_operator([](XlaOp x, XlaOp y) { return x + y; }, - op::Add(op::Constant(), op::Constant())); + GmockMatch(m::Add(m::Constant(), m::Constant()))); test_binary_operator([](XlaOp x, XlaOp y) { return x - y; }, - op::Subtract(op::Constant(), op::Constant())); + GmockMatch(m::Subtract(m::Constant(), m::Constant()))); test_binary_operator([](XlaOp x, XlaOp y) { return x * y; }, - op::Multiply(op::Constant(), op::Constant())); + GmockMatch(m::Multiply(m::Constant(), m::Constant()))); test_binary_operator([](XlaOp x, XlaOp y) { return x / y; }, - op::Divide(op::Constant(), op::Constant())); + GmockMatch(m::Divide(m::Constant(), m::Constant()))); test_binary_operator([](XlaOp x, XlaOp y) { return x & y; }, - op::And(op::Constant(), op::Constant())); + GmockMatch(m::And(m::Constant(), m::Constant()))); test_binary_operator([](XlaOp x, XlaOp y) { return x | y; }, - op::Or(op::Constant(), op::Constant())); + GmockMatch(m::Or(m::Constant(), m::Constant()))); test_binary_operator([](XlaOp x, XlaOp y) { return x ^ y; }, - op::Xor(op::Constant(), op::Constant())); + GmockMatch(m::Xor(m::Constant(), m::Constant()))); test_binary_operator([](XlaOp x, XlaOp y) { return x << y; }, - op::ShiftLeft(op::Constant(), op::Constant())); + 
GmockMatch(m::ShiftLeft(m::Constant(), m::Constant()))); test_binary_operator( [](XlaOp x, XlaOp y) { return x >> y; }, - op::ShiftRightArithmetic(op::Constant(), op::Constant())); + GmockMatch(m::ShiftRightArithmetic(m::Constant(), m::Constant()))); auto test_unsigned_binary_operator = - [&](std::function op, - ::testing::Matcher matches_pattern) { + [&](std::function op, auto matches_pattern) { XlaBuilder b(TestName()); op(ConstantR0(&b, 1), ConstantR0(&b, 2)); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); @@ -140,7 +140,7 @@ TEST_F(XlaBuilderTest, BinaryOperatorsBuildExpectedHLO) { }; test_unsigned_binary_operator( [](XlaOp x, XlaOp y) { return x >> y; }, - op::ShiftRightLogical(op::Constant(), op::Constant())); + GmockMatch(m::ShiftRightLogical(m::Constant(), m::Constant()))); } TEST_F(XlaBuilderTest, VariadicAnd) { @@ -151,12 +151,12 @@ TEST_F(XlaBuilderTest, VariadicAnd) { TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); // Don't specify in the test whether And(x, y, z) is right- or // left-associative; accept either one. - EXPECT_THAT( - module->entry_computation()->root_instruction(), - ::testing::AnyOf(op::And(op::Parameter(0), - op::And(op::Parameter(1), op::Parameter(2))), - op::And(op::And(op::Parameter(0), op::Parameter(1)), - op::Parameter(2)))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + ::testing::AnyOf( + GmockMatch(m::And(m::Parameter(0), + m::And(m::Parameter(1), m::Parameter(2)))), + GmockMatch(m::And(m::And(m::Parameter(0), m::Parameter(1)), + m::Parameter(2))))); } TEST_F(XlaBuilderTest, VariadicOr) { @@ -167,12 +167,12 @@ TEST_F(XlaBuilderTest, VariadicOr) { TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); // Don't specify in the test whether Or(x, y, z) is right- or // left-associative; accept either one. 
- EXPECT_THAT( - module->entry_computation()->root_instruction(), - ::testing::AnyOf( - op::Or(op::Parameter(0), op::Or(op::Parameter(1), op::Parameter(2))), - op::Or(op::Or(op::Parameter(0), op::Parameter(1)), - op::Parameter(2)))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + ::testing::AnyOf( + GmockMatch(m::Or(m::Parameter(0), + m::Or(m::Parameter(1), m::Parameter(2)))), + GmockMatch(m::Or(m::Or(m::Parameter(0), m::Parameter(1)), + m::Parameter(2))))); } TEST_F(XlaBuilderTest, ShiftRightOperatorOnNonIntegerProducesError) { @@ -191,7 +191,8 @@ TEST_F(XlaBuilderTest, ParamPlusConstantHasScalarBroadcast) { Add(x, ConstantR0(&b, 1.0)); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Parameter(), op::Broadcast(op::Constant()))); + EXPECT_THAT(root, + GmockMatch(m::Add(m::Parameter(), m::Broadcast(m::Constant())))); } TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) { @@ -207,7 +208,8 @@ TEST_F(XlaBuilderTest, ParamPlusParamHasBroadcast) { TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Parameter(0), op::Broadcast(op::Parameter(1)))); + EXPECT_THAT( + root, GmockMatch(m::Add(m::Parameter(0), m::Broadcast(m::Parameter(1))))); } TEST_F(XlaBuilderTest, XPlusX) { @@ -216,7 +218,7 @@ TEST_F(XlaBuilderTest, XPlusX) { Add(x, x); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Parameter(0), op::Parameter(0))); + EXPECT_THAT(root, GmockMatch(m::Add(m::Parameter(0), m::Parameter(0)))); } TEST_F(XlaBuilderTest, ShapeInferenceError) { @@ -268,8 +270,8 @@ TEST_F(XlaBuilderTest, Call) { Add(Call(&b, call, {x, y}), Call(&b, call, {one, two})); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = 
module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Call(op::Parameter(), op::Parameter()), - op::Call(op::Constant(), op::Constant()))); + EXPECT_THAT(root, GmockMatch(m::Add(m::Call(m::Parameter(), m::Parameter()), + m::Call(m::Constant(), m::Constant())))); } TEST_F(XlaBuilderTest, BinopHasDegenerateBroadcast) { @@ -289,8 +291,9 @@ TEST_F(XlaBuilderTest, BinopHasDegenerateBroadcast) { // \ / // add auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Parameter(0), - op::Broadcast(op::Reshape(op::Parameter(1))))); + EXPECT_THAT(root, + GmockMatch(m::Add(m::Parameter(0), + m::Broadcast(m::Reshape(m::Parameter(1)))))); } TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) { @@ -314,8 +317,9 @@ TEST_F(XlaBuilderTest, BinopHasInDimAndDegenerateBroadcast) { // \ / // add auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Broadcast(op::Parameter(0)), - op::Broadcast(op::Reshape(op::Parameter(1))))); + EXPECT_THAT(root, + GmockMatch(m::Add(m::Broadcast(m::Parameter(0)), + m::Broadcast(m::Reshape(m::Parameter(1)))))); } TEST_F(XlaBuilderTest, BroadcastInDim) { @@ -325,7 +329,7 @@ TEST_F(XlaBuilderTest, BroadcastInDim) { /*broadcast_dimensions=*/{0, 2}); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Broadcast()); + EXPECT_THAT(root, GmockMatch(m::Broadcast())); } TEST_F(XlaBuilderTest, BroadcastInDimWithDegeneratedDim) { @@ -335,7 +339,7 @@ TEST_F(XlaBuilderTest, BroadcastInDimWithDegeneratedDim) { /*broadcast_dimensions=*/{0, 1, 2}); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); EXPECT_THAT(module->entry_computation()->root_instruction(), - op::Broadcast(op::Reshape(op::Broadcast()))); + GmockMatch(m::Broadcast(m::Reshape(m::Broadcast())))); } TEST_F(XlaBuilderTest, BroadcastInDimWithNegativeSize) { @@ -368,7 +372,7 @@ TEST_F(XlaBuilderTest, 
ReshapeDefaultOrder) { Reshape(x, /*new_sizes=*/{6, 35}); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Reshape(op::Parameter())); + EXPECT_THAT(root, GmockMatch(m::Reshape(m::Parameter()))); } TEST_F(XlaBuilderTest, ReshapeHasTranspose) { @@ -377,7 +381,7 @@ TEST_F(XlaBuilderTest, ReshapeHasTranspose) { Reshape(x, /*dimensions=*/{3, 2, 1, 0}, /*new_sizes=*/{6, 35}); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Reshape(op::Transpose(op::Parameter()))); + EXPECT_THAT(root, GmockMatch(m::Reshape(m::Transpose(m::Parameter())))); } TEST_F(XlaBuilderTest, Transpose) { @@ -386,7 +390,7 @@ TEST_F(XlaBuilderTest, Transpose) { Transpose(x, /*permutation=*/{1, 0}); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Transpose(op::Parameter())); + EXPECT_THAT(root, GmockMatch(m::Transpose(m::Parameter()))); } TEST_F(XlaBuilderTest, AllGatherR1) { @@ -481,14 +485,48 @@ TEST_F(XlaBuilderTest, AllToAllTuple) { TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - // AllToAll is converted into a single all-to-all HloInstruction. - EXPECT_EQ(root->opcode(), HloOpcode::kAllToAll); + // Check shape and replica groups. 
auto expected_shape = ShapeUtil::MakeShapeWithDenseLayout(F32, /* dimensions= */ {2, 4}, /* minor_to_major= */ {0, 1}); - EXPECT_THAT(root, op::ShapeWithLayout(ShapeUtil::MakeTupleShape( - {expected_shape, expected_shape}))); - EXPECT_THAT(root, op::ReplicaGroups({{0, 1}})); + auto tuple_shape = + ShapeUtil::MakeTupleShape({expected_shape, expected_shape}); + auto is_replica_group_pred = [](const HloInstruction* instr) { + return instr->replica_groups().size() == 1 && + absl::c_equal(instr->replica_groups()[0].replica_ids(), + std::vector{0, 1}); + }; + + // AllToAll is converted into a single all-to-all HloInstruction. + EXPECT_THAT(root, GmockMatch(m::Op() + .WithOpcode(HloOpcode::kAllToAll) + .WithShapeEqualTo(&tuple_shape) + .WithPredicate(is_replica_group_pred))); +} + +TEST_F(XlaBuilderTest, AllReduceTuple) { + XlaBuilder b(TestName()); + auto shape0 = ShapeUtil::MakeShape(F32, {}); + auto shape1 = ShapeUtil::MakeShape(F32, {1, 2}); + auto p0 = Parameter(&b, 0, shape0, "p0"); + auto p1 = Parameter(&b, 1, shape1, "p1"); + + XlaBuilder bsum(TestName()); + auto f32Scalar = ShapeUtil::MakeShape(F32, {}); + Add(Parameter(&bsum, 0, f32Scalar, "x"), Parameter(&bsum, 1, f32Scalar, "y")); + TF_ASSERT_OK_AND_ASSIGN(auto sum, bsum.Build()); + + AllReduceTuple({p0, p1}, sum); + TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); + auto root = module->entry_computation()->root_instruction(); + + // Check shape and replica groups. + auto tuple_shape = ShapeUtil::MakeTupleShape({shape0, shape1}); + + // AllToAll is converted into a single all-to-all HloInstruction. + EXPECT_THAT(root, GmockMatch(m::Op() + .WithOpcode(HloOpcode::kAllReduce) + .WithShapeEqualTo(&tuple_shape))); } TEST_F(XlaBuilderTest, CollectivePermute) { @@ -514,7 +552,7 @@ TEST_F(XlaBuilderTest, GetDimensionSizeConstant) { XlaBuilder b(TestName()); auto x = Parameter(&b, 0, ShapeUtil::MakeShape(F32, {5, 7}, {false, true}), "x"); - // Get dimension size from a contant dimension gives us a constant. 
+ // Get dimension size from a constant dimension gives us a constant. GetDimensionSize(x, 0); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); @@ -536,7 +574,7 @@ TEST_F(XlaBuilderTest, ReportErrorOrReturnHandlesNonErrors) { Add(b.ReportErrorOrReturn(op), ConstantR0(&b, 2.0)); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Add(op::Constant(), op::Constant())); + EXPECT_THAT(root, GmockMatch(m::Add(m::Constant(), m::Constant()))); } TEST_F(XlaBuilderTest, ReportErrorOrReturnHandlesErrors) { @@ -554,7 +592,7 @@ TEST_F(XlaBuilderTest, BuildWithSpecificRoot) { Add(constant, ConstantR0(&b, 2.0)); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/constant)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Constant()); + EXPECT_THAT(root, GmockMatch(m::Constant())); } TEST_F(XlaBuilderTest, BuildWithSpecificRootAndMultipleParameters) { @@ -568,7 +606,7 @@ TEST_F(XlaBuilderTest, BuildWithSpecificRootAndMultipleParameters) { Add(x, Sub(y, z)); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b, /*root=*/x)); auto root = module->entry_computation()->root_instruction(); - EXPECT_THAT(root, op::Parameter()); + EXPECT_THAT(root, GmockMatch(m::Parameter())); EXPECT_EQ(module->entry_computation()->num_parameters(), 3); EXPECT_EQ(module->entry_computation()->instruction_count(), 5); } @@ -821,19 +859,19 @@ TEST_F(XlaBuilderTest, SelectIntoConditional) { TF_ASSERT_OK_AND_ASSIGN(std::unique_ptr module, BuildHloModule(&b)); - EXPECT_THAT( - module->entry_computation()->root_instruction(), - op::Conditional(op::Parameter(0), op::Parameter(1), op::Parameter(2))); + EXPECT_THAT(module->entry_computation()->root_instruction(), + GmockMatch(m::Conditional(m::Parameter(0), m::Parameter(1), + m::Parameter(2)))); EXPECT_THAT(module->entry_computation() ->root_instruction() 
->branch_computation(0) ->root_instruction(), - op::Parameter(0)); + GmockMatch(m::Parameter(0))); EXPECT_THAT(module->entry_computation() ->root_instruction() ->branch_computation(1) ->root_instruction(), - op::Parameter(0)); + GmockMatch(m::Parameter(0))); } TEST_F(XlaBuilderTest, DynamicPad) { @@ -1420,7 +1458,7 @@ TEST_F(XlaBuilderTest, ComparisonType) { (void)Le(ConstantR0(&b, 1), ConstantR0(&b, 2)); TF_ASSERT_OK_AND_ASSIGN(auto module, BuildHloModule(&b)); auto root = module->entry_computation()->root_instruction(); - ASSERT_THAT(root, op::Compare(op::Constant(), op::Constant())); + ASSERT_THAT(root, GmockMatch(m::Compare(m::Constant(), m::Constant()))); EXPECT_EQ(Comparison::Type::kSigned, DynCast(root)->type()); } diff --git a/tensorflow/compiler/xla/client/xla_computation.h b/tensorflow/compiler/xla/client/xla_computation.h index c7f8280a066..d8f2d0a4d5b 100644 --- a/tensorflow/compiler/xla/client/xla_computation.h +++ b/tensorflow/compiler/xla/client/xla_computation.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_ #define TENSORFLOW_COMPILER_XLA_CLIENT_XLA_COMPUTATION_H_ +#include +#include #include #include "tensorflow/compiler/xla/service/hlo.pb.h" @@ -32,7 +34,7 @@ class XlaComputation { XlaComputation(HloModuleProto proto) : unique_id_(proto.id()), proto_(std::move(proto)) {} - ~XlaComputation() {} + ~XlaComputation() = default; XlaComputation(const XlaComputation&) = delete; XlaComputation& operator=(const XlaComputation&) = delete; diff --git a/tensorflow/compiler/xla/comparison_util.cc b/tensorflow/compiler/xla/comparison_util.cc index 69f4c0b2100..7cda7d4457c 100644 --- a/tensorflow/compiler/xla/comparison_util.cc +++ b/tensorflow/compiler/xla/comparison_util.cc @@ -31,37 +31,14 @@ namespace { // Verifies that this is a valid Comparison: (1) not a partial ordering on // integers, and (2) a valid PrimitiveType. 
bool IsValidComparison(xla::PrimitiveType type, Comparison::Order order) { - switch (type) { - case F16: - case F32: - case BF16: - case F64: - case F8E5M2: - case F8E4M3FN: - case F8E4M3B11FNUZ: - case C64: - case C128: - return true; - case S4: - case S8: - case S16: - case S32: - case S64: - case PRED: - case U4: - case U8: - case U16: - case U32: - case U64: - return order == Comparison::Order::kTotal; - case TUPLE: - case OPAQUE_TYPE: - case TOKEN: - case PRIMITIVE_TYPE_INVALID: - case PrimitiveType_INT_MAX_SENTINEL_DO_NOT_USE_: - case PrimitiveType_INT_MIN_SENTINEL_DO_NOT_USE_: - return false; + if (primitive_util::IsFloatingPointType(type) || + primitive_util::IsComplexType(type)) { + return true; } + if (primitive_util::IsIntegralType(type) || type == PRED) { + return order == Comparison::Order::kTotal; + } + LOG(FATAL) << "Unsupported type: " << PrimitiveType_Name(type); } // Returns the X32 primitive type for each Type. @@ -91,32 +68,14 @@ Comparison::Order DefaultOrdering(Comparison::Type type) { // Returns the expected ordering for each primitive type. Comparison::Order DefaultOrdering(PrimitiveType type) { - switch (type) { - case S4: - case S8: - case S16: - case S32: - case S64: - case PRED: - case U4: - case U8: - case U16: - case U32: - case U64: - return Comparison::Order::kTotal; - case F8E5M2: - case F8E4M3FN: - case F8E4M3B11FNUZ: - case BF16: - case F16: - case F32: - case F64: - case C64: - case C128: - return Comparison::Order::kPartial; - default: - LOG(FATAL) << "Unsupported type: " << PrimitiveType_Name(type); + if (primitive_util::IsFloatingPointType(type) || + primitive_util::IsComplexType(type)) { + return Comparison::Order::kPartial; } + if (primitive_util::IsIntegralType(type) || type == PRED) { + return Comparison::Order::kTotal; + } + LOG(FATAL) << "Unsupported type: " << PrimitiveType_Name(type); } // Returns the converse of `direction`. 
@@ -248,33 +207,17 @@ StatusOr StringToComparisonType( } Comparison::Type Comparison::DefaultComparisonType(PrimitiveType type) { - switch (type) { - case S4: - case S8: - case S16: - case S32: - case S64: - return Type::kSigned; - case PRED: - case U4: - case U8: - case U16: - case U32: - case U64: - return Type::kUnsigned; - case F8E5M2: - case F8E4M3FN: - case F8E4M3B11FNUZ: - case F16: - case F32: - case BF16: - case F64: - case C64: - case C128: - return Type::kFloat; - default: - LOG(FATAL) << "Unexpected: " << PrimitiveType_Name(type); + if (primitive_util::IsFloatingPointType(type) || + primitive_util::IsComplexType(type)) { + return Type::kFloat; } + if (primitive_util::IsSignedIntegralType(type)) { + return Type::kSigned; + } + if (primitive_util::IsUnsignedIntegralType(type) || type == PRED) { + return Type::kUnsigned; + } + LOG(FATAL) << "Unexpected: " << PrimitiveType_Name(type); } Comparison::Comparison(Direction dir, PrimitiveType type, Order order) @@ -312,36 +255,10 @@ std::optional Comparison::Inverse() const { // operand is NaN. 
return std::nullopt; } - switch (primitive_type_) { - case F16: - case F32: - case BF16: - case F64: - case F8E5M2: - case F8E4M3FN: - case F8E4M3B11FNUZ: - case C64: - case C128: - case S4: - case S8: - case S16: - case S32: - case S64: - case PRED: - case U4: - case U8: - case U16: - case U32: - case U64: - return Comparison(xla::Inverse(dir_), primitive_type_, order_); - case TUPLE: - case OPAQUE_TYPE: - case TOKEN: - case PRIMITIVE_TYPE_INVALID: - case PrimitiveType_INT_MAX_SENTINEL_DO_NOT_USE_: - case PrimitiveType_INT_MIN_SENTINEL_DO_NOT_USE_: - return std::nullopt; + if (primitive_util::IsArrayType(primitive_type_)) { + return Comparison(xla::Inverse(dir_), primitive_type_, order_); } + return std::nullopt; } bool Comparison::IsReflexive() const { diff --git a/tensorflow/compiler/xla/comparison_util.h b/tensorflow/compiler/xla/comparison_util.h index 1b6f349e8b9..5cd434104b2 100644 --- a/tensorflow/compiler/xla/comparison_util.h +++ b/tensorflow/compiler/xla/comparison_util.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_XLA_COMPARISON_UTIL_H_ #define TENSORFLOW_COMPILER_XLA_COMPARISON_UTIL_H_ +#include +#include #include #include #include @@ -183,7 +185,8 @@ class Comparison { // Applies the comparison from this Comparison's direction and ordering for // integral types. - template ::value, int> = 0> + template ::is_integer, int> = 0> inline bool Compare(const T a, const T b) const { DCHECK(primitive_util::IsCanonicalRepresentation(primitive_type_)); return GetComparator()(a, b); @@ -192,9 +195,7 @@ class Comparison { // Applies the comparison from this Comparison's direction and ordering // for floating point types. 
template ::value || - std::is_same::value, - int> = 0> + absl::enable_if_t::is_integer, int> = 0> inline bool Compare(const T a, const T b) const { DCHECK(primitive_util::IsCanonicalRepresentation(primitive_type_)); if (IsTotalOrder()) { diff --git a/tensorflow/compiler/xla/cpu_function_runtime.h b/tensorflow/compiler/xla/cpu_function_runtime.h index 14c5fec2ff8..151fc90d2d0 100644 --- a/tensorflow/compiler/xla/cpu_function_runtime.h +++ b/tensorflow/compiler/xla/cpu_function_runtime.h @@ -24,17 +24,37 @@ limitations under the License. namespace xla { namespace cpu_function_runtime { + +struct EncodedBufferInfo { + uint64_t packed_kind_and_size = 0; + uint32_t entry_param_number = -1; + uint32_t result_param_number = -1; +}; + // Stores information about one buffer used by an XLA:CPU compiled function. // These buffers are used for holding inputs to the computation, outputs from // the computation and as temporary scratch space. class BufferInfo { public: // Creates a BufferInfo from a serialized encoding generated by `Encode`. - explicit BufferInfo(std::pair encoding) - : entry_param_number_(encoding.second) { + // TODO(ecg): remove once there are no users left. + explicit BufferInfo(uint64_t packed_kind_and_size, + uint32_t entry_param_number, uint32_t result_param_number) + : entry_param_number_(entry_param_number), + result_param_number_(result_param_number) { Kind kind; uint64_t size; - Unpack(encoding.first, &kind, &size); + Unpack(packed_kind_and_size, &kind, &size); + kind_ = kind; + size_ = size; + } + + explicit BufferInfo(const EncodedBufferInfo& encoded) + : entry_param_number_(encoded.entry_param_number), + result_param_number_(encoded.result_param_number) { + Kind kind; + uint64_t size; + Unpack(encoded.packed_kind_and_size, &kind, &size); kind_ = kind; size_ = size; } @@ -46,14 +66,31 @@ class BufferInfo { // Returns true if this buffer stores an entry parameter. 
These may or may // not need to be allocated by the runtime, depending on // XlaCompiledCpuFunction::AllocMode. - bool is_entry_parameter() const { return kind() == Kind::kEntryParameter; } + bool is_entry_parameter() const { + return kind() == Kind::kParameter && entry_param_number_ >= 0; + } // Returns the entry parameter number of this buffer. - uint64_t entry_parameter_number() const { + uint32_t entry_parameter_number() const { assert(is_entry_parameter()); return entry_param_number_; } + void set_result_parameter_number(uint32_t param_number) { + result_param_number_ = param_number; + } + + bool is_result_parameter() const { + // Note: the kind is not unique, e.g. could be a kTempBuffer, or a + // kParameter if it is an in-out argument. + return result_param_number_ >= 0; + } + + uint32_t result_parameter_number() const { + assert(is_result_parameter()); + return result_param_number_; + } + // Returns true if this buffer is temporary scratch space required by the XLA // computations. These are always allocated by the runtime. bool is_temp_buffer() const { return kind() == Kind::kTempBuffer; } @@ -69,11 +106,13 @@ class BufferInfo { // reconstruct the BufferInfo later using the constructor. We need this // because we use BufferInfo in places where using protocol buffers would // negatively impact binary size. 
- std::pair Encode() const { + EncodedBufferInfo Encode() const { static_assert(sizeof(*this) == 16, ""); - uint64_t upper = Pack(kind(), size_); - uint64_t lower = entry_param_number_; - return {upper, lower}; + EncodedBufferInfo ret; + ret.packed_kind_and_size = Pack(kind(), size_); + ret.entry_param_number = entry_param_number_; + ret.result_param_number = result_param_number_; + return ret; } bool operator==(const BufferInfo& buffer_info) const { @@ -87,20 +126,26 @@ class BufferInfo { // Factory methods: static BufferInfo MakeTempBuffer(uint64_t size) { - return BufferInfo(Kind::kTempBuffer, /*size=*/size, - /*entry_param_number=*/-1); + return BufferInfo(Kind::kTempBuffer, size); } static BufferInfo MakeConstant(uint64_t size) { - return BufferInfo(Kind::kConstant, /*size=*/size, - /*entry_param_number=*/-1); + return BufferInfo(Kind::kConstant, size); } - static BufferInfo MakeEntryParameter(uint64_t size, uint64_t param_number) { - return BufferInfo(Kind::kEntryParameter, /*size=*/size, - /*entry_param_number=*/param_number); + // Note: in-out parameters are possible by first creating an entry parameter + // and then calling set_result_parameter_number(). + static BufferInfo MakeEntryParameter(uint64_t size, + uint32_t entry_param_number) { + return BufferInfo(Kind::kParameter, size, entry_param_number); + } + // Only used in tests. Here we use kTempBuffer but it is unimportant. 
+ static BufferInfo MakeResultParameter(uint64_t size, + uint32_t result_param_number) { + // Here we + return BufferInfo(Kind::kTempBuffer, size, /*entry_param_number=*/-1, + result_param_number); } static BufferInfo MakeOnStackBuffer(uint64_t size) { - return BufferInfo(Kind::kOnStackBuffer, /*size=*/size, - /*entry_param_number=*/-1); + return BufferInfo(Kind::kOnStackBuffer, size); } private: @@ -109,14 +154,25 @@ class BufferInfo { enum class Kind : uint64_t { kConstant, kTempBuffer, - kEntryParameter, + kParameter, kOnStackBuffer }; Kind kind() const { return static_cast(kind_); } - explicit BufferInfo(Kind kind, uint64_t size, uint64_t entry_param_number) - : kind_(kind), size_(size), entry_param_number_(entry_param_number) {} + explicit BufferInfo(Kind kind, uint64_t size) + : BufferInfo(kind, size, + /*entry_param_number=*/-1, + /*result_param_number=*/-1) {} + explicit BufferInfo(Kind kind, uint64_t size, uint32_t entry_param_number) + : BufferInfo(kind, size, entry_param_number, + /*result_param_number=*/-1) {} + explicit BufferInfo(Kind kind, uint64_t size, uint32_t entry_param_number, + uint32_t result_param_number) + : kind_(kind), + size_(size), + entry_param_number_(entry_param_number), + result_param_number_(result_param_number) {} static uint64_t Pack(Kind kind, uint64_t size) { return (static_cast(size) << 2) | static_cast(kind); @@ -129,7 +185,8 @@ class BufferInfo { Kind kind_ : 2; uint64_t size_ : 62; - int64_t entry_param_number_; + int32_t entry_param_number_ = -1; + int32_t result_param_number_ = -1; }; // Align to 64-bytes, to mimic tsl::Allocator::kAllocatorAlignment. 
diff --git a/tensorflow/compiler/xla/debug_options_flags.cc b/tensorflow/compiler/xla/debug_options_flags.cc index dabed025c92..783af8e2300 100644 --- a/tensorflow/compiler/xla/debug_options_flags.cc +++ b/tensorflow/compiler/xla/debug_options_flags.cc @@ -81,6 +81,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_cuda_graph_instantiation_threshold(2); opts.set_xla_gpu_enable_persistent_temp_buffers(false); opts.set_xla_gpu_cuda_graph_capture_threshold(2); + opts.set_xla_gpu_cuda_graph_enable_concurrent_region(true); // Despite the name, fast min/max on GPUs does not seem to be any faster, and // adds very counter-intuitive "NaN-swallowing" behavior. @@ -90,6 +91,8 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_allow_excess_precision(true); opts.set_xla_force_host_platform_device_count(1); opts.set_xla_gpu_all_reduce_combine_threshold_bytes(30 * 1024 * 1024); + opts.set_xla_gpu_all_gather_combine_threshold_bytes(1024 * 1024 * 1024); + opts.set_xla_gpu_reduce_scatter_combine_threshold_bytes(30 * 1024 * 1024); opts.set_xla_gpu_enable_async_all_reduce(true); opts.set_xla_gpu_enable_reassociation_for_converted_ar(true); @@ -105,13 +108,14 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { // Set 4GB space limit for redzone scratch allocator. 
opts.set_xla_gpu_redzone_scratch_max_megabytes(1LL << 12); opts.set_xla_gpu_shape_checks(DebugOptions::RUNTIME); - opts.set_xla_gpu_enable_mlir_lowering(true); opts.set_xla_gpu_normalize_layouts(true); opts.set_xla_gpu_simplify_all_fp_conversions(true); opts.set_xla_dump_latency_hiding_schedule(false); opts.set_xla_gpu_enable_latency_hiding_scheduler(false); opts.set_xla_gpu_lhs_enable_gpu_async_tracker(false); - opts.set_xla_gpu_pgle_profile_directory(""); + opts.set_xla_gpu_pgle_profile_file_or_directory_path(""); + opts.set_xla_gpu_enable_highest_priority_async_stream(false); + opts.set_xla_gpu_enable_data_parallel_collective_optimizer(false); opts.set_xla_cpu_enable_mlir_tiling_and_fusion(true); opts.set_xla_cpu_enable_custom_matmul_tiling(false); @@ -128,11 +132,14 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() { opts.set_xla_gpu_enable_cudnn_int8x32_convolution_reordering(true); opts.set_xla_gpu_triton_gemm_any(false); - // Moving reduce-scatter out of while loops can incrase memory footprint, so + // Moving reduce-scatter out of while loops can increase memory footprint, so // turning it off by default. opts.set_xla_gpu_enable_while_loop_reduce_scatter_code_motion(false); opts.set_xla_gpu_collective_inflation_factor(1); + + opts.set_xla_gpu_enable_experimental_block_size(false); + return opts; } @@ -265,11 +272,6 @@ void MakeDebugOptionsFlags(std::vector* flag_list, return true; }; - auto setter_for_xla_gpu_enable_mlir_lowering = [debug_options](bool value) { - debug_options->set_xla_gpu_enable_mlir_lowering(value); - return true; - }; - // Custom "sub-parser" lambda for xla_partitioning_algorithm. 
auto setter_for_xla_partitioning_algorithm = [debug_options](const std::string& value) { @@ -773,6 +775,18 @@ void MakeDebugOptionsFlags(std::vector* flag_list, &DebugOptions::set_xla_gpu_all_reduce_combine_threshold_bytes), debug_options->xla_gpu_all_reduce_combine_threshold_bytes(), "Size threshold (in bytes) for the GPU all-reduce combiner.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_all_gather_combine_threshold_bytes", + int64_setter_for( + &DebugOptions::set_xla_gpu_all_gather_combine_threshold_bytes), + debug_options->xla_gpu_all_gather_combine_threshold_bytes(), + "Size threshold (in bytes) for the GPU all-gather combiner.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_reduce_scatter_combine_threshold_bytes", + int64_setter_for( + &DebugOptions::set_xla_gpu_reduce_scatter_combine_threshold_bytes), + debug_options->xla_gpu_reduce_scatter_combine_threshold_bytes(), + "Size threshold (in bytes) for the GPU reduce-scatter combiner.")); flag_list->push_back(tsl::Flag( "xla_gpu_all_reduce_contiguous", bool_setter_for(&DebugOptions::set_xla_gpu_all_reduce_contiguous), @@ -844,6 +858,13 @@ void MakeDebugOptionsFlags(std::vector* flag_list, debug_options->xla_gpu_cuda_graph_capture_threshold(), "Capture a region as a function to be launched as cuda graph if the " "number of moved instructions reaches this threshold.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_cuda_graph_enable_concurrent_region", + bool_setter_for( + &DebugOptions::set_xla_gpu_cuda_graph_enable_concurrent_region), + debug_options->xla_gpu_cuda_graph_enable_concurrent_region(), + "Identify concurrent regions in cuda graphs and execute them " + "concurrently.")); flag_list->push_back(tsl::Flag( "xla_gpu_enable_persistent_temp_buffers", @@ -904,10 +925,6 @@ void MakeDebugOptionsFlags(std::vector* flag_list, "xla_gpu_shape_checks", setter_for_xla_gpu_shape_checks, DebugOptions::ShapeChecks_Name(debug_options->xla_gpu_shape_checks()), "When to perform shape checks in XLA:GPU.")); - 
flag_list->push_back(tsl::Flag( - "xla_gpu_enable_mlir_lowering", setter_for_xla_gpu_enable_mlir_lowering, - debug_options->xla_gpu_enable_mlir_lowering(), - "Enable MLIR-based lowering in XLA:GPU instead of LLVM emitters.")); flag_list->push_back( tsl::Flag("xla_gpu_normalize_layouts", bool_setter_for(&DebugOptions::set_xla_gpu_normalize_layouts), @@ -969,15 +986,28 @@ void MakeDebugOptionsFlags(std::vector* flag_list, debug_options->xla_gpu_enable_latency_hiding_scheduler(), "Enable latency-hiding scheduler for XLA:GPU")); flag_list->push_back(tsl::Flag( - "xla_gpu_pgle_profile_directory", - string_setter_for(&DebugOptions::set_xla_gpu_pgle_profile_directory), - debug_options->xla_gpu_pgle_profile_directory(), - "Directory for PGLE profiles in XLA:GPU")); + "xla_gpu_pgle_profile_file_or_directory_path", + string_setter_for( + &DebugOptions::set_xla_gpu_pgle_profile_file_or_directory_path), + debug_options->xla_gpu_pgle_profile_file_or_directory_path(), + "Directory or file for PGLE profiles in XLA:GPU")); flag_list->push_back(tsl::Flag( "xla_gpu_lhs_enable_gpu_async_tracker", bool_setter_for(&DebugOptions::set_xla_gpu_lhs_enable_gpu_async_tracker), debug_options->xla_gpu_lhs_enable_gpu_async_tracker(), "Enable GPU async tracker for latency-hiding scheduler in XLA:GPU")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_highest_priority_async_stream", + bool_setter_for( + &DebugOptions::set_xla_gpu_enable_highest_priority_async_stream), + debug_options->xla_gpu_enable_highest_priority_async_stream(), + "Enable async stream to have the highest priority.")); + flag_list->push_back(tsl::Flag( + "xla_gpu_enable_data_parallel_collective_optimizer", + bool_setter_for( + &DebugOptions::set_xla_gpu_enable_data_parallel_collective_optimizer), + debug_options->xla_gpu_enable_data_parallel_collective_optimizer(), + "Enable data parallel collective optimizer.")); flag_list->push_back(tsl::Flag( "xla_partitioning_algorithm", setter_for_xla_partitioning_algorithm, 
DebugOptions::PartitioningAlgorithm_Name( @@ -1001,6 +1031,12 @@ void MakeDebugOptionsFlags(std::vector* flag_list, debug_options->xla_gpu_triton_gemm_any(), "Use Triton-based matrix multiplication for any GEMM it " "supports without filtering only faster ones.")); + flag_list->push_back( + tsl::Flag("xla_gpu_enable_experimental_block_size", + bool_setter_for( + &DebugOptions::set_xla_gpu_enable_experimental_block_size), + debug_options->xla_gpu_enable_experimental_block_size(), + "Enable experimental block size.")); } // NOLINT(readability/fn_size) // Allocates flag_values and flag_objects; this function must not be called more diff --git a/tensorflow/compiler/xla/examples/axpy/BUILD b/tensorflow/compiler/xla/examples/axpy/BUILD index a2e266481dd..8c1442922c3 100644 --- a/tensorflow/compiler/xla/examples/axpy/BUILD +++ b/tensorflow/compiler/xla/examples/axpy/BUILD @@ -1,5 +1,7 @@ load("//tensorflow/compiler/xla:xla.bzl", "xla_cc_test") +# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) + xla_cc_test( name = "stablehlo_compile_test", srcs = ["stablehlo_compile_test.cc"], diff --git a/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/BUILD b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/BUILD index 769e8a76bb0..c09b88d80c9 100644 --- a/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/BUILD +++ b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/BUILD @@ -1,9 +1,11 @@ load("@local_config_cuda//cuda:build_defs.bzl", "cuda_library", "if_cuda") +load("//tensorflow/compiler/xla:xla.bzl", "xla_cc_test") + +# copybara:uncomment package(default_applicable_licenses = ["//tensorflow:license"]) cc_library( name = "sm_bw_utils", hdrs = ["sm_bw_utils.h"], - defines = if_cuda(["GOOGLE_CUDA=1"]), deps = [ "//tensorflow/tsl/platform:logging", ] + if_cuda([ @@ -20,13 +22,15 @@ cuda_library( ], ) -cc_test( +xla_cc_test( name = "sm_bw_test", srcs = ["sm_bw_test.cc"], - tags = ["requires-gpu-sm80-only"], + tags 
= ["requires-gpu-nvidia"], deps = [ ":sm_bw_kernels", ":sm_bw_utils", "@com_google_googletest//:gtest_main", - ], + ] + if_cuda([ + "//tensorflow/tsl/platform:cuda", + ]), ) diff --git a/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.cu.cc b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.cu.cc index d0bc62cd0de..e170e44d66d 100644 --- a/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.cu.cc +++ b/tensorflow/compiler/xla/experiments/sm_bandwidth_benchmark/sm_bw_kernels.cu.cc @@ -77,21 +77,22 @@ template __launch_bounds__(kMaxBlockSize) __global__ void BenchmarkDeviceCopyKernel(const float* __restrict__ in, float* __restrict__ out, int64_t size) { + constexpr int kVecWidth = chunks < 4 ? 1 : 4; const int64_t lines = size / (blockDim.x * chunks); const int64_t start_line = lines * blockIdx.x / gridDim.x; const int64_t end_line = lines * (blockIdx.x + 1) / gridDim.x; const int64_t start_offset = - start_line * blockDim.x * chunks + 4 * threadIdx.x; + start_line * blockDim.x * chunks + kVecWidth * threadIdx.x; const int64_t end_offset = end_line * blockDim.x * chunks; - Vec buffer[chunks / 4]; + Vec buffer[chunks / kVecWidth]; for (int64_t i = start_offset; i < end_offset; i += blockDim.x * chunks) { #pragma unroll - for (int j = 0; j < chunks; j += 4) { - LoadNc(buffer[j / 4], in + i + blockDim.x * j, 0); + for (int j = 0; j < chunks; j += kVecWidth) { + LoadNc(buffer[j / kVecWidth], in + i + blockDim.x * j, 0); } #pragma unroll - for (int j = 0; j < chunks; j += 4) { - Store(buffer[j / 4], out + i + blockDim.x * j, 0); + for (int j = 0; j < chunks; j += kVecWidth) { + Store(buffer[j / kVecWidth], out + i + blockDim.x * j, 0); } } } diff --git a/tensorflow/compiler/xla/experiments/triton_autotuning/__init__.py b/tensorflow/compiler/xla/experiments/triton_autotuning/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git 
a/tensorflow/compiler/xla/experiments/triton_autotuning/check_csv.py b/tensorflow/compiler/xla/experiments/triton_autotuning/check_csv.py new file mode 100755 index 00000000000..c38fe4fef75 --- /dev/null +++ b/tensorflow/compiler/xla/experiments/triton_autotuning/check_csv.py @@ -0,0 +1,95 @@ +#!/usr/bin/python3 +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Measures timings of tilings provided in a CSV file.""" +import sys + +from absl import app +from absl import flags +from matmul_lib import benchmark_matmul +from matmul_lib import MatmulSize +from matmul_lib import MatmulTiling +from matmul_lib import QuantizedInputType +import pandas as pd +import torch +import tqdm + +_DATA = flags.DEFINE_string('data', '', 'Data to check') +_OUTPUT_FILE = flags.DEFINE_string( + 'output_file', '/tmp/checked.csv', 'File to write output data to' +) +_NUM_SAMPLES = flags.DEFINE_integer( + 'num_samples', 100, 'Number of samples to check' +) +_M = flags.DEFINE_integer('m', 64, 'Size of first matrix') +_K = flags.DEFINE_integer('k', 64, 'Size of contracting dimension') +_N = flags.DEFINE_integer('n', 64, 'Size of second matrix') +_QUANTIZED_LHS = flags.DEFINE_enum_class( + 'quantized_lhs', + QuantizedInputType.FULL, + QuantizedInputType, + 'Type to use for LHS quantization', +) + + +def get_actual_time(r, s, pbar): + dims = 
MatmulSize(_M.value, _N.value, _K.value, _QUANTIZED_LHS.value) + return benchmark_matmul( + dims=dims, + pbar=pbar, + shared_stream=s, + tilings=[ + MatmulTiling( + r.block_m, + r.block_n, + r.block_k, + r.split_k, + r.num_stages, + r.num_warps, + ) + ], + repetitions_ms=300, + )[0].min_time_ms + + +def main(): + df = pd.read_csv(_DATA.value).sample(_NUM_SAMPLES.value) + shared_stream = torch.cuda.Stream() + measured_times = [] + pbar = tqdm.tqdm(total=_NUM_SAMPLES.value, ncols=0) + with torch.cuda.stream(shared_stream): + for _, r in df.iterrows(): + measured_times.append(get_actual_time(r, shared_stream, pbar)) + df = df.assign(measured_min_time_ms=measured_times) + pbar.close() + + def absolute_error(r): + return abs(r.measured_min_time_ms - r.min_time_ms) + + def relative_error(r): + return absolute_error(r) / r.min_time_ms + + errors = df.assign(absolute_error=absolute_error).assign( + relative_error=relative_error + )[['absolute_error', 'relative_error']] + print(errors) + print(errors.describe()) + df.to_csv(_OUTPUT_FILE.value) + + +if __name__ == '__main__': + app.parse_flags_with_usage(sys.argv) + main() diff --git a/tensorflow/compiler/xla/experiments/triton_autotuning/check_data.py b/tensorflow/compiler/xla/experiments/triton_autotuning/check_data.py new file mode 100755 index 00000000000..8ad71b671bd --- /dev/null +++ b/tensorflow/compiler/xla/experiments/triton_autotuning/check_data.py @@ -0,0 +1,81 @@ +#!/usr/bin/python3 +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Plot actual min time vs estimated min time from Triton performance model.""" + +from collections.abc import Sequence +from absl import app +import pandas as pd +import plotext as plt +import torch +import triton + + +def main(argv: Sequence[str]) -> None: + if len(argv) != 2: + raise app.UsageError('Incorrect number of command-line arguments.') + f = argv[1] + + df = pd.read_csv( + f, + dtype={ + 'M': int, + 'N': int, + 'K': int, + 'BLOCK_M': int, + 'BLOCK_N': int, + 'BLOCK_K': int, + 'SPLIT_K': int, + 'num_stages': int, + 'num_warps': int, + 'min_time_ms': float, + }, + ) + grouped_df = df.groupby(['M', 'N', 'K']).min().sort_values('min_time_ms') + + estimated_times = [] + actual_times = [] + + matrix = torch.randn(1, 1, device='cuda', dtype=torch.float16) + for dims, r in grouped_df.iterrows(): + m, n, k = dims + estimated_time = triton.ops.matmul_perf_model.estimate_matmul_time( + num_warps=r.num_warps, + num_stages=r.num_stages, + A=matrix, + B=matrix, + C=matrix, + M=m, + N=n, + K=k, + BLOCK_M=r.BLOCK_M, + BLOCK_N=r.BLOCK_N, + BLOCK_K=r.BLOCK_K, + SPLIT_K=r.SPLIT_K, + ) + actual_times.append(r.min_time_ms) + estimated_times.append(estimated_time) + + plt.theme('dark') + plt.plot(actual_times, estimated_times) + plt.xlabel('Actual Time (ms)') + plt.ylabel('Estimated Time (ms)') + plt.title('Estimated time as a function of actual time') + plt.show() + + +if __name__ == '__main__': + app.run(main) diff --git a/tensorflow/compiler/xla/experiments/triton_autotuning/matmul_lib.py b/tensorflow/compiler/xla/experiments/triton_autotuning/matmul_lib.py new file mode 100755 index 00000000000..2497b2100bb --- /dev/null +++ b/tensorflow/compiler/xla/experiments/triton_autotuning/matmul_lib.py @@ -0,0 +1,451 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Library for running matmuls.""" +import enum +import itertools +import logging +import math +import typing + +import torch +import tqdm +import triton +import triton.language as tl + +LOG = logging.getLogger(__name__) + +logging.basicConfig( + format=( + '%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d]' + ' %(threadName)15s: %(message)s' + ), + datefmt='%Y-%m-%d:%H:%M:%S', + level=logging.INFO, +) + + +@enum.unique +class QuantizedInputType(enum.Enum): + """Type to use for quantized matmul inputs.""" + + FULL = 'full' + INT8 = 'int8' + FLOAT8 = 'float8' + + +class MatmulTiling(typing.NamedTuple): + """Tiling parameterization of a matmul.""" + + BLOCK_M: int + BLOCK_N: int + BLOCK_K: int + SPLIT_K: int + num_stages: int + num_warps: int + + +class MatmulSize(typing.NamedTuple): + """[M, K] @ [K, N].""" + + M: int + N: int + K: int + quantized_lhs: QuantizedInputType + + +class MatmulTiming(typing.NamedTuple): + """Timing result of a configuration.""" + + dims: MatmulSize + tiling: MatmulTiling + min_time_ms: float + + +def parse_int_list(v: str) -> typing.List[int]: + """Converts a string of comma-separated ints into a list of strings.""" + return list(map(int, v.split(','))) + + +def generate_tiling_configs( + tilings_m: typing.List[int], + tilings_n: typing.List[int], + tilings_k: typing.List[int], + split_ks: 
typing.List[int], + num_stages: typing.List[int], + num_warps: typing.List[int], +) -> typing.Iterator[MatmulTiling]: + """Generate a list of matmul configs to evaluate.""" + product = itertools.product( + tilings_m, + tilings_n, + tilings_k, + split_ks, + num_stages, + num_warps, + ) + return [MatmulTiling(*p) for p in product] + + +@triton.jit +def _fix_type_for_load(x): + """Bitcasts a pointer to a type that can be loaded by Triton.""" + load_dtype = x.dtype + if x.dtype == tl.pointer_type(tl.float8e5): + load_dtype = tl.pointer_type(tl.int8) + return x.to(load_dtype, bitcast=True) + + +@triton.jit +def _matmul_kernel( + lhs, + rhs, + out, + m: tl.constexpr, + n: tl.constexpr, + k: tl.constexpr, + stride_am: tl.constexpr, + stride_ak: tl.constexpr, + stride_bk: tl.constexpr, + stride_bn: tl.constexpr, + stride_cm: tl.constexpr, + stride_cn: tl.constexpr, + block_m: tl.constexpr, + block_n: tl.constexpr, + block_k: tl.constexpr, + group_m: tl.constexpr, + split_k: tl.constexpr, + acc_ty: tl.constexpr, + # Workaround for a bug in Triton cache: + # force recompilation on different num_warps/num_stages. 
+ force_num_warps: tl.constexpr, # pylint: disable=unused-argument + force_num_stages: tl.constexpr, # pylint: disable=unused-argument +): + """Computes a block-level matmul.""" + even_k = k % (block_k * split_k) == 0 + pid0 = tl.program_id(0) + pid1 = tl.program_id(1) + pid2 = tl.program_id(2) + grid_m = (m + block_m - 1) // block_m + grid_n = (n + block_n - 1) // block_n + # re-order program ID for better L2 performance + width = group_m * grid_n + group_id = pid0 // width + group_size = min(grid_m - group_id * group_m, group_m) + pid_m = group_id * group_m + pid0 % group_size + pid_n = (pid0 % width) // group_size + rm = pid_m * block_m + tl.arange(0, block_m) + rn = pid_n * block_n + tl.arange(0, block_n) + ram = tl.max_contiguous(tl.multiple_of(rm % m, block_m), block_m) + rbn = tl.max_contiguous(tl.multiple_of(rn % n, block_n), block_n) + rk = pid1 * block_k + tl.arange(0, block_k) + lhs += ram[:, None] * stride_am + rk[None, :] * stride_ak + pid2 * m * k + rhs += rk[:, None] * stride_bk + rbn[None, :] * stride_bn + acc = tl.zeros((block_m, block_n), dtype=acc_ty) + # for ki in range(0, k, block_k * split_k): # pytype: disable=wrong-arg-types + for ki in range(k, 0, -block_k * split_k): # pytype: disable=wrong-arg-types + if even_k: + a = tl.load(_fix_type_for_load(lhs)) + b = tl.load(rhs) + else: + a = tl.load(_fix_type_for_load(lhs), mask=rk[None, :] < ki, other=0) + b = tl.load(rhs, mask=rk[:, None] < ki, other=0) + casted_a = a.to(lhs.dtype.element_ty, bitcast=True).to(out.dtype.element_ty) + casted_b = b.to(out.dtype.element_ty) + acc += tl.dot(casted_a, casted_b, allow_tf32=True) + lhs += block_k * split_k * stride_ak + rhs += block_k * split_k * stride_bk + acc = acc.to(out.dtype.element_ty) + # rematerialize rm and rn to save registers + rm = pid_m * block_m + tl.arange(0, block_m) + rn = pid_n * block_n + tl.arange(0, block_n) + out += rm[:, None] * stride_cm + rn[None, :] * stride_cn + pid2 * m * n + out += m * n * pid1 + mask = (rm < m)[:, None] & 
(rn < n)[None, :] + tl.store(out, acc, mask=mask) + + +@triton.jit +def _reduce_kernel( + src, + dest, + row_size: tl.constexpr, + col_size: tl.constexpr, + row_block_size: tl.constexpr, +): + """Computes a column reduction.""" + pid0 = tl.program_id(0) + idx = pid0 * row_block_size + tl.arange(0, row_block_size) + src += idx + acc = tl.zeros((row_block_size,), dtype=dest.dtype.element_ty) + for _ in range(col_size): + acc += tl.load(src, mask=idx < row_size, other=0) + src += row_size + tl.store(dest + idx, acc, mask=idx < row_size) + + +@triton.jit +def _to_f8_kernel(src, dest, size, block_size: tl.constexpr): + pid = tl.program_id(0) + offs = pid * block_size + tl.arange(0, block_size) + mask = offs < size + x = tl.load(src + offs, mask=mask) + y = x.to(tl.float8e5) + tl.store(dest + offs, y, mask=mask) + + +def to_triton_f8(x: torch.Tensor) -> triton.TensorWrapper: + """Converts torch tensors to triton.language.float8e5.""" + assert x.is_contiguous(), 'Kernel only works for contiguous tensors' + ret = triton.reinterpret( + torch.empty(x.shape, dtype=torch.int8, device=x.device, layout=x.layout), + tl.float8e5, + ) + grid = lambda META: (triton.cdiv(x.numel(), META['block_size']),) + _to_f8_kernel[grid](ret, x, x.numel(), block_size=1024) + return ret + + +def benchmark_matmul_tiling( + dims: MatmulSize, + tiling: MatmulTiling, + s: torch.cuda.Stream, + shared_stream: torch.cuda.Stream, + a: torch.Tensor | triton.TensorWrapper, + b: torch.Tensor, + c: torch.Tensor, + scratchpad: torch.Tensor, # Largest size: c * SPLIT_K + repetitions_ms: int, + debug=False, +) -> typing.Optional[MatmulTiming]: + """Benchmarks a single matmul tiling.""" + grid = lambda META: ( # pylint: disable=g-long-lambda + triton.cdiv(dims.M, tiling.BLOCK_M) * triton.cdiv(dims.N, tiling.BLOCK_N), + tiling.SPLIT_K, + 1, # batch + ) + data_a = getattr(a, 'base', a) + + def run_matmul(): + used_output = c if tiling.SPLIT_K == 1 else scratchpad + _matmul_kernel[grid]( + a, + b, + used_output, + 
m=int(dims.M), + n=int(dims.N), + k=int(dims.K), + stride_am=data_a.stride(0), + stride_ak=data_a.stride(1), + stride_bk=b.stride(0), + stride_bn=b.stride(1), + stride_cm=c.stride(0), + stride_cn=c.stride(1), + block_m=int(tiling.BLOCK_M), + block_n=int(tiling.BLOCK_N), + block_k=int(tiling.BLOCK_K), + group_m=8, + split_k=tiling.SPLIT_K, + num_warps=tiling.num_warps, + num_stages=tiling.num_stages, + force_num_warps=tiling.num_warps, + force_num_stages=tiling.num_stages, + acc_ty=tl.float32, + ) + if tiling.SPLIT_K != 1: + # Run reduction kernel. + _reduce_kernel[(triton.cdiv(dims.M * dims.N, 1024),)]( + scratchpad, + c, + row_size=int(dims.M), + col_size=tiling.SPLIT_K, + num_stages=1, + num_warps=1024 // 32, + row_block_size=1024, + ) + + for dim in ['M', 'N', 'K']: + next_pow2 = lambda v: 2 ** int(math.ceil(math.log2(v))) + dim_size: int = getattr(dims, dim) + if dim == 'K': + dim_size = math.ceil(dim_size / tiling.SPLIT_K) + tile_size = getattr(tiling, f'BLOCK_{dim}') + if next_pow2(dim_size) < tile_size: + if debug: + LOG.error( + 'Tile %s larger than the dimension %s (%s)', + tile_size, + dim, + dim_size, + ) + return None + + if tiling.BLOCK_M * tiling.BLOCK_N > 131072: + if debug: + LOG.error('Overly large tile') + return None + + # TODO(cheshire): Compilation time is huge for such tiles. + if tiling.BLOCK_M > 512 or tiling.BLOCK_N > 512: + if debug: + LOG.error('Overly large tile') + return None + + max_shared_memory = triton.runtime.driver.utils.get_device_properties( + torch.cuda.current_device() + )['max_shared_mem'] + + required_shared_memory = ( + (tiling.BLOCK_M + tiling.BLOCK_N) + * tiling.BLOCK_K + * tiling.num_stages + * b.element_size() + ) + if required_shared_memory > max_shared_memory: + if debug: + LOG.error('Skipping %s due to exceeding shmem bound', tiling) + return None + with torch.cuda.stream(s): + try: + run_matmul() # Warmup on our own stream. 
+ except Exception as exc: + LOG.error('%s for %s generated %s', tiling, dims, exc, exc_info=True) + raise + + # Use shared stream to take actual measurements. + with torch.cuda.stream(shared_stream): + try: + percentiles = triton.testing.do_bench( + run_matmul, + warmup=0, + rep=repetitions_ms, + quantiles=(0.001, 0.1, 0.5, 0.9), + ) + min_ms = percentiles[0] + except Exception as exc: + LOG.error('%s for %s generated %s', tiling, dims, exc, exc_info=True) + raise + return MatmulTiming(dims, tiling, min_ms) + + +def benchmark_cublas(dims: MatmulSize) -> MatmulTiming: + """Measure cublas performance.""" + a = torch.randn(dims.M, dims.K, device='cuda', dtype=torch.bfloat16) + b = torch.randn(dims.K, dims.N, device='cuda', dtype=torch.bfloat16) + run_matmul = lambda: torch.matmul(a, b) + percentiles = triton.testing.do_bench( + run_matmul, warmup=0, rep=300, quantiles=(0.001, 0.1, 0.5, 0.9) + ) + min_ms = percentiles[0] + return min_ms + + +def benchmark_matmul( + dims: MatmulSize, + pbar: tqdm.std.tqdm, + shared_stream: torch.cuda.Stream, + tilings: typing.List[MatmulTiling], + repetitions_ms: int, + debug=False, +) -> typing.Sequence[MatmulTiming]: + """For a given matmul configuration, benchmark it. + + Args: + dims: the dimensions of the matmul + pbar: a progress bar + shared_stream: stream to execute benchmarks on + tilings: list of tilings to benchmark + repetitions_ms: how many milliseconds to spend running each configuration + debug: whether to print debug output + + Returns: + A sequence of matmul timings. + """ + out: list[MatmulTiming] = [] + largest_splitk = max(tilings, key=lambda t: t.SPLIT_K).SPLIT_K + + s = torch.cuda.Stream() + + # Use our own stream for compilation. 
+ with torch.cuda.stream(s): + if dims.quantized_lhs == QuantizedInputType.INT8: + a = torch.randint( + 0, 128, (dims.M, dims.K), device='cuda', dtype=torch.int8 + ) + elif dims.quantized_lhs == QuantizedInputType.FLOAT8: + a = to_triton_f8( + torch.randn(dims.M, dims.K, device='cuda', dtype=torch.bfloat16) + ) + else: + a = torch.randn(dims.M, dims.K, device='cuda', dtype=torch.bfloat16) + + b = torch.randn(dims.K, dims.N, device='cuda', dtype=torch.bfloat16) + data_a = getattr(a, 'base', a) + assert data_a.shape[1] == b.shape[0], 'incompatible dimensions' + assert data_a.is_contiguous(), 'matrix A must be contiguous' + assert b.is_contiguous(), 'matrix B must be contiguous' + c = torch.empty((dims.M, dims.N), device=a.device, dtype=torch.bfloat16) + scratchpad = torch.empty( + (largest_splitk, dims.M, dims.N), device=a.device, dtype=torch.bfloat16 + ) + + LOG.info('Autotuning for %s', dims) + + for tiling in tilings: + pbar.update(1) + + timing = benchmark_matmul_tiling( + dims, + tiling, + s, + shared_stream, + a, + b, + c, + scratchpad, + repetitions_ms=repetitions_ms, + debug=debug, + ) + if not timing: + continue + + out.append(timing) + return out + + +def print_roofline_performance(dims: MatmulSize, time_ms: float): + """Print theoretical roofline model performance.""" + gbps: float = triton.testing.get_dram_gbps() + tflops: float = triton.testing.get_max_tensorcore_tflops(torch.bfloat16) + lhs_size_bytes = dims.M * dims.K + rhs_size_bytes = dims.K * dims.N * 2 + out_size_bytes = dims.M * dims.N * 2 + + size_gb = (lhs_size_bytes + rhs_size_bytes + out_size_bytes) / 1e9 + roofline_time_ms_bw = (size_gb / gbps) * 1e3 + roofline_time_ms_flops = 2 * (dims.M * dims.N * dims.K) / (tflops * 1e9) + + best_time_ms = max(roofline_time_ms_bw, roofline_time_ms_flops) + bound = ( + 'bandwidth' if roofline_time_ms_bw > roofline_time_ms_flops else 'flops' + ) + + print( + f'Percentage of roofline: {(best_time_ms * 100 / time_ms):0.4f}%' + f' ({bound} bound)' + ) + + 
print(f'Roofline time if bandwidth bound: {roofline_time_ms_bw:0.4f}ms') + print(f'Roofline time if flops bound: {roofline_time_ms_flops:0.4f}ms') diff --git a/tensorflow/compiler/xla/experiments/triton_autotuning/run_single_matmul.py b/tensorflow/compiler/xla/experiments/triton_autotuning/run_single_matmul.py new file mode 100755 index 00000000000..382ff29b19f --- /dev/null +++ b/tensorflow/compiler/xla/experiments/triton_autotuning/run_single_matmul.py @@ -0,0 +1,91 @@ +#!/usr/bin/python3 +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Runs a single matmul with a supplied configuration.""" +import sys + +from absl import app +from absl import flags +from matmul_lib import benchmark_cublas +from matmul_lib import benchmark_matmul +from matmul_lib import MatmulSize +from matmul_lib import MatmulTiling +from matmul_lib import print_roofline_performance +from matmul_lib import QuantizedInputType +import torch +import tqdm + + +_M = flags.DEFINE_integer('m', 64, 'Size of first matrix') +_K = flags.DEFINE_integer('k', 64, 'Size of contracting dimension') +_N = flags.DEFINE_integer('n', 64, 'Size of second matrix') +_QUANTIZED_LHS = flags.DEFINE_enum_class( + 'quantized_lhs', + QuantizedInputType.FULL, + QuantizedInputType, + 'Type to use for LHS quantization', +) + +_BLOCK_M = flags.DEFINE_integer('block_m', 16, 'Tiling in M-dimension') +_BLOCK_N = flags.DEFINE_integer('block_n', 16, 'Tiling in N-dimension') +_BLOCK_K = flags.DEFINE_integer('block_k', 16, 'Tiling in K-dimension') + +_SPLIT_K = flags.DEFINE_integer( + 'split_k', 1, 'Number of splits for contracting dimension' +) +_NUM_STAGES = flags.DEFINE_integer( + 'num_stages', 1, 'Number of pipelining stages' +) +_NUM_WARPS = flags.DEFINE_integer( + 'num_warps', 4, 'Number of warps to allocate in a given block' +) +_DEBUG = flags.DEFINE_bool('debug', False, 'Print debug information') + + +def main(): + s = torch.cuda.Stream() + pbar = tqdm.tqdm(ncols=0) + dims = MatmulSize(_M.value, _N.value, _K.value, _QUANTIZED_LHS.value) + timing = benchmark_matmul( + dims=dims, + pbar=pbar, + shared_stream=s, + tilings=[ + MatmulTiling( + _BLOCK_M.value, + _BLOCK_N.value, + _BLOCK_K.value, + _SPLIT_K.value, + _NUM_STAGES.value, + _NUM_WARPS.value, + ) + ], + repetitions_ms=300, + debug=_DEBUG.value, + ) + if len(timing) != 1: + print('Failed to find working configuration') + sys.exit(1) + t = timing[0] + print(f'Timing: {t}') + print_roofline_performance(dims, t.min_time_ms) 
+ cublas_time = benchmark_cublas(dims) + print(f'Reference cuBLAS time (bf16xbf16->bf16): {cublas_time:0.4f}ms') + + +if __name__ == '__main__': + app.parse_flags_with_usage(sys.argv) + main() diff --git a/tensorflow/compiler/xla/experiments/triton_autotuning/search.py b/tensorflow/compiler/xla/experiments/triton_autotuning/search.py new file mode 100755 index 00000000000..4b2fb618c76 --- /dev/null +++ b/tensorflow/compiler/xla/experiments/triton_autotuning/search.py @@ -0,0 +1,223 @@ +#!/usr/bin/python3 +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Launch Triton search for good tiling sizes, save to CSV.""" + +import concurrent.futures +import csv +import itertools +import logging +import os +import random +import sys +import time +import typing + +from absl import app +from absl import flags +from matmul_lib import benchmark_matmul +from matmul_lib import generate_tiling_configs +from matmul_lib import MatmulSize +from matmul_lib import MatmulTiming +from matmul_lib import parse_int_list +from matmul_lib import QuantizedInputType +import numpy as np +import torch +import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm + +LOG = logging.getLogger(__name__) + +_OUTPUT_FILE = flags.DEFINE_string( + 'output_file', + 'out.csv', + """File to generate output into. 
+ +1) Output is streamed: for each point processed, incremental output is written +out. +2) Restarts with checkpointing are supported: the script will not regenerate data +for files already present. +""", +) +_MAX_WORKERS = flags.DEFINE_integer( + 'max_workers', 64, 'Number of threads to use' +) +_REPETITIONS_MS = flags.DEFINE_integer( + 'repetitions_ms', 300, 'Number of requests' +) +_NUM_SAMPLES = flags.DEFINE_integer('num_samples', 1000, 'Number of samples ') +_TILINGS_M = flags.DEFINE_string( + 'tilings_m', '32, 64, 128, 256', 'Tilings to try for M' +) +_TILINGS_N = flags.DEFINE_string( + 'tilings_n', '32, 64, 128, 256', 'Tilings to try for N' +) +_TILINGS_K = flags.DEFINE_string( + 'tilings_k', '32, 64, 128, 256, 512', 'Tilings to try for K' +) +_NUM_STAGES = flags.DEFINE_string( + 'num_stages', '1,2,3', 'Number of stages to try' +) +_NUM_WARPS = flags.DEFINE_string('num_warps', '4,8', 'Number of warps to try') +_SPLIT_KS = flags.DEFINE_string( + 'split_ks', '1,2,3,4,5', 'Number of split_k values to try' +) + +logging.basicConfig( + format=( + '%(asctime)s,%(msecs)03d %(levelname)-8s [%(filename)s:%(lineno)d]' + ' %(threadName)15s: %(message)s' + ), + datefmt='%Y-%m-%d:%H:%M:%S', + level=logging.INFO, +) + +# pylint: disable=g-long-lambda +# pylint: disable=g-complex-comprehension +# pylint: disable=cell-var-from-loop + + +def read_timings() -> typing.Set[MatmulSize]: + """Find timings already existing in the file.""" + out: typing.Set[MatmulSize] = set() + with open(_OUTPUT_FILE.value) as f: + reader = csv.reader(f) + for row in reader: + if row[0].isdigit(): + # M, N, K + quantized_lhs + out.add(MatmulSize(*map(int, row[:4]))) + return out + + +def write_csv_header() -> None: + """Write CSV file header.""" + with open(_OUTPUT_FILE.value, 'w') as f: + fieldnames = [ + 'M', + 'N', + 'K', + 'quantized_lhs', + 'BLOCK_M', + 'BLOCK_N', + 'BLOCK_K', + 'SPLIT_K', + 'num_stages', + 'num_warps', + 'min_time_ms', + ] + writer = csv.writer(f) + 
writer.writerow(fieldnames) + + +def write_timings(timings: typing.Sequence[MatmulTiming]) -> None: + """Write matmul timing data to CSV output.""" + with open(_OUTPUT_FILE.value, 'a') as f: + writer = csv.writer(f) + for d in timings: + writer.writerow([ + d.dims.M, + d.dims.N, + d.dims.K, + d.dims.quantized_lhs, + d.tiling.BLOCK_M, + d.tiling.BLOCK_N, + d.tiling.BLOCK_K, + d.tiling.SPLIT_K, + d.tiling.num_stages, + d.tiling.num_warps, + d.min_time_ms, + ]) + + +def generate_samples() -> typing.List[MatmulSize]: + """Generate a list of matmuls we will be benchmarking.""" + m_axis = np.unique(np.logspace(4, 13, num=200, dtype=np.int64, base=2)) + n_axis = np.unique(np.logspace(4, 13, num=200, dtype=np.int64, base=2)) + k_axis = np.unique(np.logspace(4, 13, num=200, dtype=np.int64, base=2)) + q = [QuantizedInputType.INT8] + out = [MatmulSize(*p) for p in itertools.product(m_axis, n_axis, k_axis, q)] + out = random.choices(out, k=_NUM_SAMPLES.value) + return out + + +def run_search( + existing_samples: typing.Set[MatmulSize], +) -> typing.Sequence[MatmulTiming]: + """Run search on a list of matmul configurations.""" + samples: typing.Sequence[MatmulSize] = [ + s for s in generate_samples() if s not in existing_samples + ] + t0 = time.time() + shared_stream = torch.cuda.Stream() + tilings = generate_tiling_configs( + parse_int_list(_TILINGS_M.value), + parse_int_list(_TILINGS_N.value), + parse_int_list(_TILINGS_K.value), + parse_int_list(_SPLIT_KS.value), + parse_int_list(_NUM_STAGES.value), + parse_int_list(_NUM_WARPS.value), + ) + + with concurrent.futures.ThreadPoolExecutor( + max_workers=_MAX_WORKERS.value + ) as executor: + pbar = tqdm.tqdm(total=len(samples) * len(tilings), ncols=0) + results = [] + with logging_redirect_tqdm(): + if _MAX_WORKERS.value == 1: + for c in samples: + res = benchmark_matmul( + c, pbar, shared_stream, tilings, _REPETITIONS_MS.value + ) + results.extend(res) + write_timings(res) + else: + future_to_dims = { + executor.submit( + 
benchmark_matmul, + c, + pbar, + shared_stream, + tilings, + _REPETITIONS_MS.value, + ): c + for c in samples + } + for future in concurrent.futures.as_completed(future_to_dims): + res = future.result() + results.extend(res) + write_timings(res) + + pbar.close() + + LOG.info('%d datapoints generated in %.2fs', len(results), (time.time() - t0)) + return results + + +def main() -> None: + existing_samples: typing.Set[MatmulSize] = set() + if os.path.isfile(_OUTPUT_FILE.value): + existing_samples = read_timings() + else: + write_csv_header() + + run_search(existing_samples) + + +if __name__ == '__main__': + random.seed(42) + app.parse_flags_with_usage(sys.argv) + main() diff --git a/tensorflow/compiler/xla/experiments/triton_autotuning/tune_single_matmul.py b/tensorflow/compiler/xla/experiments/triton_autotuning/tune_single_matmul.py new file mode 100755 index 00000000000..09971e6baaa --- /dev/null +++ b/tensorflow/compiler/xla/experiments/triton_autotuning/tune_single_matmul.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== + +"""Finds best tuning for a single matmul.""" +import csv +import sys + +from absl import app +from absl import flags +from matmul_lib import benchmark_cublas +from matmul_lib import benchmark_matmul +from matmul_lib import generate_tiling_configs +from matmul_lib import MatmulSize +from matmul_lib import MatmulTiming +from matmul_lib import parse_int_list +from matmul_lib import print_roofline_performance +from matmul_lib import QuantizedInputType +import torch +import tqdm + +_M = flags.DEFINE_integer('m', 64, 'Size of first matrix') +_K = flags.DEFINE_integer('k', 64, 'Size of contracting dimension') +_N = flags.DEFINE_integer('n', 64, 'Size of second matrix') +_QUANTIZED_LHS = flags.DEFINE_enum_class( + 'quantized_lhs', + QuantizedInputType.FULL, + QuantizedInputType, + 'Type to use for LHS quantization', +) + +_TILINGS_M = flags.DEFINE_string( + 'tilings_m', '32, 64, 128, 256', 'Tilings to try for M' +) +_TILINGS_N = flags.DEFINE_string( + 'tilings_n', '32, 64, 128, 256', 'Tilings to try for N' +) +_TILINGS_K = flags.DEFINE_string( + 'tilings_k', '32, 64, 128, 256, 512', 'Tilings to try for K' +) +_NUM_STAGES = flags.DEFINE_string( + 'num_stages', '1,2,3', 'Number of stages to try' +) +_NUM_WARPS = flags.DEFINE_string('num_warps', '4,8', 'Number of warps to try') +_SPLIT_KS = flags.DEFINE_string( + 'split_ks', '1,2,3,4,5', 'Number of split_k values to try' +) +_DEBUG = flags.DEFINE_bool('debug', False, 'Print debug information') +_APPEND_TO_CSV = flags.DEFINE_string( + 'append_to_csv', + None, + 'If set, appends the best tiling to the CSV file passed', +) + + +def main() -> None: + dims = MatmulSize( + M=_M.value, N=_N.value, K=_K.value, quantized_lhs=_QUANTIZED_LHS.value + ) + s = torch.cuda.Stream() + tilings = generate_tiling_configs( + parse_int_list(_TILINGS_M.value), + parse_int_list(_TILINGS_N.value), + parse_int_list(_TILINGS_K.value), + parse_int_list(_SPLIT_KS.value), 
+ parse_int_list(_NUM_STAGES.value), + parse_int_list(_NUM_WARPS.value), + ) + pbar = tqdm.tqdm(total=len(tilings), ncols=0) + timings = sorted( + benchmark_matmul( + dims, pbar, s, tilings, repetitions_ms=300, debug=_DEBUG.value + ), + key=lambda t: t.min_time_ms, + ) + fastest: MatmulTiming = timings[0] + print(f'Fastest configuration: {fastest}') + + features_list = [ + 'BLOCK_M', + 'BLOCK_N', + 'BLOCK_K', + 'SPLIT_K', + 'num_stages', + 'num_warps', + ] + features = frozenset(features_list) + for f in features: + other_features = features - {f} + + def other_features_equal_to_best(t): + return all( + getattr(fastest.tiling, of) == getattr(t.tiling, of) + for of in other_features # pylint: disable=cell-var-from-loop + ) + + # Keep everyting but the currently evaluated feature fixed to the best + # value. + others_fixed = [t for t in timings if other_features_equal_to_best(t)] + + # TODO(cheshire): Visualize. + print( + f'Varying feature {f}:', + ', '.join( + f'{t.min_time_ms:0.4f} @ {f}={getattr(t.tiling, f)}' + for t in others_fixed + ), + ) + + print_roofline_performance(dims, fastest.min_time_ms) + cublas_time = benchmark_cublas(dims) + print(f'Reference cuBLAS time (bf16xbf16->bf16): {cublas_time:0.4f}ms') + + if _APPEND_TO_CSV.value: + fields = ( + ['M', 'N', 'K', 'quantized_lhs'] + + features_list + + ['min_time_ms', 'cublas_time_ms'] + ) + with open(_APPEND_TO_CSV.value, 'a') as f: + writer = csv.DictWriter(f, fieldnames=fields) + if f.tell() == 0: + writer.writeheader() + writer.writerow( + dict( + fastest.dims._asdict(), + **fastest.tiling._asdict(), + min_time_ms=fastest.min_time_ms, + cublas_time_ms=cublas_time, + ) + ) + + +if __name__ == '__main__': + app.parse_flags_with_usage(sys.argv) + main() diff --git a/tensorflow/compiler/xla/g3doc/operation_semantics.md b/tensorflow/compiler/xla/g3doc/operation_semantics.md index 1f9ffa35a3d..83e21ca58de 100644 --- a/tensorflow/compiler/xla/g3doc/operation_semantics.md +++ 
b/tensorflow/compiler/xla/g3doc/operation_semantics.md @@ -2950,6 +2950,36 @@ relative order of the equal values is preserved. Two elements `e1` and `e2` are equal if and only if `comparator(e1, e2) = comparator(e2, e1) = false`. By default, `is_stable` is set to false. +## Top-K + +See also the `jax.lax.top_k` operation. + +`TopK(operand)` + +Arguments | Type | Semantics +------------ | ---------------- | --------------------------------------------- +`operand` | `XlaOp` | N-dimensional array +`k` | `int64` | Integer specifying the number of top entries. +`comparator` | `XlaComputation` | The comparator computation to use. + +Returns top `k` values and their indices as a tuple, along the last dimension of +the operand using the given `comparator` (for usual topk behavior, it should be +strict-greater-than operation). + +For example, given strict `>` operator, `k=1` and the following operand of shape +`f32[2,3]`: + +``` +[[0.1, 0.3, 0.1], [0.7, 0.2, -0.1]] +``` + +The TopK application returns the following tuple of shape `(f32[2,1], +s32[2,1])`: + +``` +([[0.3], [0.7]], [[1], [0]]) +``` + ## Transpose See also the `tf.reshape` operation. diff --git a/tensorflow/compiler/xla/glob_lit_test.bzl b/tensorflow/compiler/xla/glob_lit_test.bzl index 8863805fff7..217dd18e608 100644 --- a/tensorflow/compiler/xla/glob_lit_test.bzl +++ b/tensorflow/compiler/xla/glob_lit_test.bzl @@ -65,6 +65,7 @@ def _run_lit_test(name, data, size, tags, driver, features, exec_properties): ) def glob_lit_tests( + name = None, exclude = [], test_file_exts = _default_test_file_exts, default_size = _default_size, @@ -79,6 +80,7 @@ def glob_lit_tests( """Creates all plausible Lit tests (and their inputs) under this directory. Args: + name: str, name of the test_suite rule to generate for running all tests. exclude: [str], paths to exclude (for tests and inputs). test_file_exts: [str], extensions for files that are tests. default_size: str, the test size for targets not in "size_override". 
@@ -104,7 +106,10 @@ def glob_lit_tests( # Run tests individually such that errors can be attributed to a specific # failure. + all_tests = [] for curr_test in tests: + all_tests.append(curr_test + ".test") + # Instantiate this test with updated parameters. _run_lit_test( name = curr_test + ".test", @@ -115,3 +120,11 @@ def glob_lit_tests( features = features, exec_properties = exec_properties, ) + + # TODO: remove this check after making it a required param. + if name: + native.test_suite( + name = name, + tests = all_tests, + tags = ["manual"], + ) diff --git a/tensorflow/compiler/xla/hlo/evaluator/BUILD b/tensorflow/compiler/xla/hlo/evaluator/BUILD index d9a7ef2218d..a4253c21179 100644 --- a/tensorflow/compiler/xla/hlo/evaluator/BUILD +++ b/tensorflow/compiler/xla/hlo/evaluator/BUILD @@ -63,6 +63,7 @@ cc_library( "//tensorflow/compiler/xla/service/cpu:runtime_single_threaded_matmul", "//tensorflow/tsl/lib/core:bitmap", "//tensorflow/tsl/platform:errors", + "//tensorflow/tsl/platform:float8", "//tensorflow/tsl/platform:logging", "//tensorflow/tsl/platform:protobuf", "//tensorflow/tsl/platform:status", @@ -104,11 +105,9 @@ xla_cc_test( "//tensorflow/compiler/xla/tests:test_utils", "//tensorflow/compiler/xla/tests:xla_internal_test_main", # fixdeps: keep "//tensorflow/tsl/lib/core:status_test_util", - "//tensorflow/tsl/platform:logging", "//tensorflow/tsl/platform:status", "//tensorflow/tsl/platform:test", "//tensorflow/tsl/platform:test_benchmark", - "@com_google_absl//absl/memory", "@com_google_absl//absl/strings:str_format", ], ) diff --git a/tensorflow/compiler/xla/hlo/evaluator/hlo_evaluator.cc b/tensorflow/compiler/xla/hlo/evaluator/hlo_evaluator.cc index 67311db899f..9a6f0dc8cf9 100644 --- a/tensorflow/compiler/xla/hlo/evaluator/hlo_evaluator.cc +++ b/tensorflow/compiler/xla/hlo/evaluator/hlo_evaluator.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/xla/hlo/evaluator/hlo_evaluator.h" #include +#include #include #include #include @@ -64,6 +65,7 @@ limitations under the License. #include "tensorflow/compiler/xla/xla_data.pb.h" #include "tensorflow/tsl/lib/core/bitmap.h" #include "tensorflow/tsl/platform/errors.h" +#include "tensorflow/tsl/platform/float8.h" #include "tensorflow/tsl/platform/logging.h" #include "tensorflow/tsl/platform/protobuf.h" #include "tensorflow/tsl/platform/status.h" @@ -74,9 +76,7 @@ namespace xla { namespace { -template -using NativeTypeOf = - typename primitive_util::PrimitiveTypeToNative::type; +using primitive_util::NativeTypeOf; template StatusOr Compare(const Shape& shape, ComparisonDirection direction, @@ -237,48 +237,17 @@ struct PopulateImpl { // native types to avoid templating the whole implementations. template